This assignment is based on the following 2D object detection tutorial, which uses PyTorch to implement the SSD network and detect objects in images from the VOC dataset: https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection
First, we mount our Google Drive.
# Mount Google Drive at /content/gdrive so the assignment folder can be reached from this notebook
from google.colab import drive
drive.mount('/content/gdrive')
Mounted at /content/gdrive
# Go to your assignment directory
%cd /content/gdrive/MyDrive/Colab Notebooks/ece495_assignment4
# Where am I?
!ls
## You should have this output:
# /content/gdrive/MyDrive/Colab Notebooks/ece495_assignment4
# ece495_assignment4.ipynb utils.py VOC2012 VOC2007
# You might also have the JSON files if you have already run this notebook,
# and also the checkpoint if you have already trained the model
/content/gdrive/MyDrive/Colab Notebooks/ece495_assignment4 checkpoint_ssd300_ResNet.pth.tar __pycache__ utils.py checkpoint_ssd300_VGG.pth.tar TEST_images.json VOC2007 checkpoint_ssd300_VGG_scheduler.pth.tar TEST_objects.json VOC2007_test ece495_assignment4.ipynb TRAIN_images.json label_map.json TRAIN_objects.json
This code only has to be run once. It creates the JSON files TRAIN_images.json, TRAIN_objects.json and label_map.json, which hold the image paths, the ground-truth object information and the label-to-number mapping, respectively. This should take about 1 hour and 12 minutes.
from utils import create_data_lists
import time
# Record the wall-clock time before and after the call so the duration can be reported
start = time.time()
# Note: the VOC2007 path points inside the assignment folder, the VOC2012 path at the top level of the drive
create_data_lists(voc07_path='/content/gdrive/MyDrive/Colab Notebooks/ece495_assignment4/VOC2007',
voc12_path='/content/gdrive/MyDrive/VOC2012',
output_folder='./')
end = time.time()
# Elapsed time in seconds
print("time elapsed:", end - start)
Next, the Dataset loader for VOC is implemented.
import torch
from torch.utils.data import Dataset
import json
import os
from PIL import Image
from utils import transform
class PascalVOCDataset(Dataset):
    """
    A PyTorch Dataset class to be used in a PyTorch DataLoader to create batches.

    The image paths and per-image object annotations are read from
    '<SPLIT>_images.json' and '<SPLIT>_objects.json' inside the data folder.
    """

    def __init__(self, data_folder, split, keep_difficult=False):
        """
        :param data_folder: folder where data files are stored
        :param split: split, one of 'TRAIN' or 'TEST' (case-insensitive)
        :param keep_difficult: keep or discard objects that are considered difficult to detect?
        """
        self.split = split.upper()

        assert self.split in {'TRAIN', 'TEST'}

        self.data_folder = data_folder
        self.keep_difficult = keep_difficult

        # Read data files
        with open(os.path.join(data_folder, self.split + '_images.json'), 'r') as j:
            self.images = json.load(j)
        with open(os.path.join(data_folder, self.split + '_objects.json'), 'r') as j:
            self.objects = json.load(j)

        # There must be exactly one annotation entry per image
        assert len(self.images) == len(self.objects)

    def __getitem__(self, i):
        """
        Load the i-th image with its annotations and pass them through utils.transform.

        :param i: index of the sample
        :return: image, boxes, labels and difficulties as returned by transform()
        """
        # Read image; the context manager closes the underlying file handle as soon as
        # the pixel data has been converted into a standalone RGB image
        with Image.open(self.images[i], mode='r') as source:
            image = source.convert('RGB')

        # Read objects in this image (bounding boxes, labels, difficulties)
        objects = self.objects[i]
        boxes = torch.FloatTensor(objects['boxes'])  # (n_objects, 4)
        labels = torch.LongTensor(objects['labels'])  # (n_objects)
        difficulties = torch.ByteTensor(objects['difficulties'])  # (n_objects)

        # Discard difficult objects, if desired
        if not self.keep_difficult:
            # Boolean mask of the easy objects; indexing with a uint8 "1 - difficulties" mask is deprecated
            keep = difficulties == 0
            boxes = boxes[keep]
            labels = labels[keep]
            difficulties = difficulties[keep]

        # Apply transformations
        image, boxes, labels, difficulties = transform(image, boxes, labels, difficulties, split=self.split)

        return image, boxes, labels, difficulties

    def __len__(self):
        """
        :return: number of images in this split
        """
        return len(self.images)

    def collate_fn(self, batch):
        """
        Since each image may have a different number of objects, we need a collate function (to be passed to the DataLoader).

        This describes how to combine these tensors of different sizes. We use lists.

        Note: this need not be defined in this Class, can be standalone.

        :param batch: an iterable of N sets from __getitem__()
        :return: a tensor of images, lists of varying-size tensors of bounding boxes, labels, and difficulties
        """
        images = list()
        boxes = list()
        labels = list()
        difficulties = list()

        for b in batch:
            images.append(b[0])
            boxes.append(b[1])
            labels.append(b[2])
            difficulties.append(b[3])

        # Images share one size and can be stacked; the per-image annotations stay as lists
        images = torch.stack(images, dim=0)

        return images, boxes, labels, difficulties  # stacked image tensor, 3 lists of N tensors each
First we create the base or encoder part of the network.
You must fill in the ResNet code.
from torch import nn
from utils import *
import torch.nn.functional as F
from math import sqrt
from itertools import product as product
import torchvision
import torchvision.models as models
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class VGGBase(nn.Module):
    """
    VGG-16 trunk that yields the two lower-level feature maps (conv4_3 and conv7).
    """

    def __init__(self):
        super(VGGBase, self).__init__()

        # The five VGG16 stages as (stage, input channels, output channels, number of 3x3 convs).
        # Registration order matters: load_pretrained_layers() pairs parameters by position.
        stages = ((1, 3, 64, 2),
                  (2, 64, 128, 2),
                  (3, 128, 256, 3),
                  (4, 256, 512, 3),
                  (5, 512, 512, 3))

        # Settings of the max-pool that closes each stage
        pool_settings = {1: dict(kernel_size=2, stride=2),
                         2: dict(kernel_size=2, stride=2),
                         3: dict(kernel_size=2, stride=2, ceil_mode=True),  # round up: 75 -> 38, not 37
                         4: dict(kernel_size=2, stride=2),
                         5: dict(kernel_size=3, stride=1, padding=1)}  # stride 1 + padding keeps the size

        for stage, c_in, c_out, depth in stages:
            for k in range(1, depth + 1):
                # Only the first conv of a stage changes the channel count (stride is 1 by default)
                conv = nn.Conv2d(c_in if k == 1 else c_out, c_out, kernel_size=3, padding=1)
                setattr(self, 'conv{}_{}'.format(stage, k), conv)
            setattr(self, 'pool{}'.format(stage), nn.MaxPool2d(**pool_settings[stage]))

        # Convolutional stand-ins for VGG16's FC6 and FC7
        self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)  # atrous convolution
        self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1)

        # Overwrite the freshly initialised weights with the pretrained ones
        self.load_pretrained_layers()

    def forward(self, image):
        """
        Run the image through the VGG trunk.

        :param image: images, a tensor of dimensions (N, 3, 300, 300)
        :return: lower-level feature maps conv4_3 and conv7
        """
        feats = image
        for conv in (self.conv1_1, self.conv1_2):
            feats = F.relu(conv(feats))  # (N, 64, 300, 300)
        feats = self.pool1(feats)  # (N, 64, 150, 150)

        for conv in (self.conv2_1, self.conv2_2):
            feats = F.relu(conv(feats))  # (N, 128, 150, 150)
        feats = self.pool2(feats)  # (N, 128, 75, 75)

        for conv in (self.conv3_1, self.conv3_2, self.conv3_3):
            feats = F.relu(conv(feats))  # (N, 256, 75, 75)
        feats = self.pool3(feats)  # (N, 256, 38, 38) thanks to ceil_mode=True

        for conv in (self.conv4_1, self.conv4_2, self.conv4_3):
            feats = F.relu(conv(feats))  # (N, 512, 38, 38)
        conv4_3_feats = feats  # first returned map, (N, 512, 38, 38)

        feats = self.pool4(feats)  # (N, 512, 19, 19)
        for conv in (self.conv5_1, self.conv5_2, self.conv5_3):
            feats = F.relu(conv(feats))  # (N, 512, 19, 19)
        feats = self.pool5(feats)  # (N, 512, 19, 19), size is unchanged by pool5

        feats = F.relu(self.conv6(feats))  # (N, 1024, 19, 19)
        conv7_feats = F.relu(self.conv7(feats))  # (N, 1024, 19, 19)

        return conv4_3_feats, conv7_feats

    def load_pretrained_layers(self):
        """
        Initialise this module from torchvision's ImageNet-pretrained VGG-16.

        conv1_1 .. conv5_3 are copied over directly. VGG-16 has no conv6/conv7, so its
        fc6 and fc7 weights are reshaped into convolution kernels and subsampled with
        'decimate' (see utils.py) down to the sizes of conv6 and conv7.
        """
        own_state = self.state_dict()
        own_keys = list(own_state.keys())

        vgg_state = torchvision.models.vgg16(pretrained=True).state_dict()
        vgg_keys = list(vgg_state.keys())

        # Positional copy of every parameter except the last four (conv6/conv7 weight and bias)
        for own_key, vgg_key in zip(own_keys[:-4], vgg_keys):
            own_state[own_key] = vgg_state[vgg_key]

        # An FC layer with K outputs acting on a flattened (C, H, W) input is the same as a
        # convolution with kernel (H, W), C input channels and K output channels, without padding.

        # fc6 -> conv6
        fc6_weight = vgg_state['classifier.0.weight'].view(4096, 512, 7, 7)  # (4096, 512, 7, 7)
        fc6_bias = vgg_state['classifier.0.bias']  # (4096)
        own_state['conv6.weight'] = decimate(fc6_weight, m=[4, None, 3, 3])  # (1024, 512, 3, 3)
        own_state['conv6.bias'] = decimate(fc6_bias, m=[4])  # (1024)

        # fc7 -> conv7
        fc7_weight = vgg_state['classifier.3.weight'].view(4096, 4096, 1, 1)  # (4096, 4096, 1, 1)
        fc7_bias = vgg_state['classifier.3.bias']  # (4096)
        own_state['conv7.weight'] = decimate(fc7_weight, m=[4, 4, None, None])  # (1024, 1024, 1, 1)
        own_state['conv7.bias'] = decimate(fc7_bias, m=[4])  # (1024)

        self.load_state_dict(own_state)

        print("\nLoaded base model.\n")
class ResNetBase(nn.Module):
    """
    ResNet base convolutions to produce lower-level feature maps.

    A pretrained ResNet-34 provides the convolutional stages. The layer4 output is
    resized to the spatial size of the layer2 output to form the 512-channel map,
    which is then pooled and passed through conv6/conv7 to form the 1024-channel map.
    """

    def __init__(self):
        super(ResNetBase, self).__init__()

        # ImageNet-pretrained ResNet-34; only its conv1, relu, maxpool and layer1-4 are used in forward()
        self.resnet = models.resnet34(pretrained=True)

        # NOTE(review): forward() uses this freshly initialised BatchNorm rather than the
        # pretrained self.resnet.bn1 - confirm this is intended. It is kept as-is so that
        # existing checkpoints keep their parameter names.
        self.bn1 = nn.BatchNorm2d(64)

        # pool1 and pool2 are not used in forward(); kept so the module's attributes are unchanged
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Same head as in VGGBase
        self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)  # atrous convolution
        self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1)

    def forward(self, image):
        """
        Forward propagation.

        :param image: images, a tensor of dimensions (N, 3, 300, 300)
        :return: lower-level feature maps with 512 and 1024 channels
        """
        # ResNet stem
        out = self.resnet.conv1(image)
        out = self.bn1(out)
        out = self.resnet.relu(out)
        out = self.resnet.maxpool(out)

        out = self.resnet.layer1(out)
        out = self.resnet.layer2(out)

        # Only the spatial size of the layer2 output is needed later, so read it directly
        # instead of cloning the whole activation tensor.
        # Presumably (38, 38) for a 300x300 input - TODO confirm.
        target_height, target_width = out.shape[2], out.shape[3]

        out = self.resnet.layer3(out)
        out = self.resnet.layer4(out)

        # Resize the layer4 output to the layer2 resolution
        out = F.interpolate(out, size=(target_height, target_width), mode='bilinear', align_corners=True)
        conv_512_feats = out

        # Halve the resolution, then apply the conv6/conv7 head
        out = self.pool3(out)
        out = F.relu(self.conv6(out))
        conv_1024_feats = F.relu(self.conv7(out))

        # Lower-level feature maps
        return conv_512_feats, conv_1024_feats
The base layers created the low-level feature maps with 512 and 1024 channels. Now the higher-level feature maps are created, with 512, 256, 256 and 256 channels respectively.
class AuxiliaryConvolutions(nn.Module):
    """
    Extra convolutions stacked on the base network to obtain the higher-level feature maps.
    """

    def __init__(self):
        super(AuxiliaryConvolutions, self).__init__()

        # Each level is a 1x1 channel reduction followed by a 3x3 conv that shrinks the map,
        # either through stride 2 (conv8_2, conv9_2) or through zero padding (conv10_2, conv11_2).
        # Stride is 1 wherever it is not given.
        layer_specs = (
            ('conv8_1', 1024, 256, dict(kernel_size=1, padding=0)),
            ('conv8_2', 256, 512, dict(kernel_size=3, stride=2, padding=1)),
            ('conv9_1', 512, 128, dict(kernel_size=1, padding=0)),
            ('conv9_2', 128, 256, dict(kernel_size=3, stride=2, padding=1)),
            ('conv10_1', 256, 128, dict(kernel_size=1, padding=0)),
            ('conv10_2', 128, 256, dict(kernel_size=3, padding=0)),
            ('conv11_1', 256, 128, dict(kernel_size=1, padding=0)),
            ('conv11_2', 128, 256, dict(kernel_size=3, padding=0)),
        )
        for layer_name, c_in, c_out, options in layer_specs:
            setattr(self, layer_name, nn.Conv2d(c_in, c_out, **options))

        # Replace the default initialisation of every convolution
        self.init_conv2d()

    def init_conv2d(self):
        """
        Xavier-uniform weights and zero biases for every direct Conv2d child.
        """
        conv_layers = [child for child in self.children() if isinstance(child, nn.Conv2d)]
        for layer in conv_layers:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0.)

    def forward(self, conv7_feats):
        """
        Compute the four higher-level feature maps.

        :param conv7_feats: lower-level conv7 feature map, a tensor of dimensions (N, 1024, 19, 19)
        :return: higher-level feature maps conv8_2, conv9_2, conv10_2, and conv11_2
        """

        def level(feats, reduce_conv, shrink_conv):
            # 1x1 reduction + ReLU, then 3x3 shrinking conv + ReLU
            return F.relu(shrink_conv(F.relu(reduce_conv(feats))))

        conv8_2_feats = level(conv7_feats, self.conv8_1, self.conv8_2)  # (N, 512, 10, 10)
        conv9_2_feats = level(conv8_2_feats, self.conv9_1, self.conv9_2)  # (N, 256, 5, 5)
        conv10_2_feats = level(conv9_2_feats, self.conv10_1, self.conv10_2)  # (N, 256, 3, 3)
        conv11_2_feats = level(conv10_2_feats, self.conv11_1, self.conv11_2)  # (N, 256, 1, 1)

        return conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats
At this point we have our 6 feature maps.
The low level feature maps: (N, 512, 38, 38), (N, 1024, 19, 19)
Also the high level feature maps: (N, 512, 10, 10), (N, 256, 5, 5), (N, 256, 3, 3), (N, 256, 1, 1)
Each prior box requires a classification output with one score per class, as well as the 4 box location values that are regressed. The convolutions that produce these outputs are created in the init function.
In the forward pass, all the convolutions are applied to their respective input feature maps. The resulting tensors are then reshaped and concatenated so that the classification output has shape (N, 8732, n_classes) and the box output has shape (N, 8732, 4). This format is easier to work with when the network output is passed to the loss function during training, or through NMS during testing.
class PredictionConvolutions(nn.Module):
    """
    Prediction heads that turn the six feature maps into box offsets and class scores.

    Box locations are predicted as encoded offsets relative to each of the 8732 prior
    (default) boxes; see 'cxcy_to_gcxgcy' in utils.py for the encoding.

    Class scores are produced for every object class at each of the 8732 priors.
    A high score for 'background' means no object.
    """

    def __init__(self, n_classes):
        """
        :param n_classes: number of different types of objects
        """
        super(PredictionConvolutions, self).__init__()

        self.n_classes = n_classes

        # For each feature map, in prior-box order: (name, input channels, priors per position).
        # 4 priors per position means 4 different aspect ratios are used there, etc.
        fmap_specs = (('conv4_3', 512, 4),
                      ('conv7', 1024, 6),
                      ('conv8_2', 512, 6),
                      ('conv9_2', 256, 6),
                      ('conv10_2', 256, 4),
                      ('conv11_2', 256, 4))

        # Localization heads: 4 offsets per prior
        for fmap, channels, n_boxes in fmap_specs:
            setattr(self, 'loc_' + fmap, nn.Conv2d(channels, n_boxes * 4, kernel_size=3, padding=1))

        # Classification heads: n_classes scores per prior
        for fmap, channels, n_boxes in fmap_specs:
            setattr(self, 'cl_' + fmap, nn.Conv2d(channels, n_boxes * n_classes, kernel_size=3, padding=1))

        # Replace the default initialisation of every convolution
        self.init_conv2d()

    def init_conv2d(self):
        """
        Xavier-uniform weights and zero biases for every direct Conv2d child.
        """
        conv_layers = [child for child in self.children() if isinstance(child, nn.Conv2d)]
        for layer in conv_layers:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0.)

    def forward(self, conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats):
        """
        Predict offsets and class scores for every prior box.

        :param conv4_3_feats: conv4_3 feature map, a tensor of dimensions (N, 512, 38, 38)
        :param conv7_feats: conv7 feature map, a tensor of dimensions (N, 1024, 19, 19)
        :param conv8_2_feats: conv8_2 feature map, a tensor of dimensions (N, 512, 10, 10)
        :param conv9_2_feats: conv9_2 feature map, a tensor of dimensions (N, 256, 5, 5)
        :param conv10_2_feats: conv10_2 feature map, a tensor of dimensions (N, 256, 3, 3)
        :param conv11_2_feats: conv11_2 feature map, a tensor of dimensions (N, 256, 1, 1)
        :return: 8732 locations and class scores (i.e. w.r.t each prior box) for each image
        """
        batch_size = conv4_3_feats.size(0)

        def per_prior(head, feats, width):
            # (N, k * width, H, W) -> (N, H, W, k * width): channels last, to match the prior-box order
            raw = head(feats).permute(0, 2, 3, 1).contiguous()  # contiguous memory is required by .view()
            # -> (N, H * W * k, width): one row per prior box
            return raw.view(batch_size, -1, width)

        # Must stay in this order, which is the order of the prior boxes
        sources = (('conv4_3', conv4_3_feats),  # 38 * 38 * 4 = 5776 priors
                   ('conv7', conv7_feats),  # 19 * 19 * 6 = 2166 priors
                   ('conv8_2', conv8_2_feats),  # 10 * 10 * 6 = 600 priors
                   ('conv9_2', conv9_2_feats),  # 5 * 5 * 6 = 150 priors
                   ('conv10_2', conv10_2_feats),  # 3 * 3 * 4 = 36 priors
                   ('conv11_2', conv11_2_feats))  # 1 * 1 * 4 = 4 priors

        # Offsets w.r.t. the prior boxes
        loc_parts = [per_prior(getattr(self, 'loc_' + fmap), feats, 4) for fmap, feats in sources]
        # Class scores for each prior box
        cl_parts = [per_prior(getattr(self, 'cl_' + fmap), feats, self.n_classes) for fmap, feats in sources]

        # 8732 priors in total
        locs = torch.cat(loc_parts, dim=1)  # (N, 8732, 4)
        classes_scores = torch.cat(cl_parts, dim=1)  # (N, 8732, n_classes)

        return locs, classes_scores
init - Defines all network layers and creates the prior boxes
create_prior_boxes - Create 8732 prior boxes across the 6 feature maps
forward - Send the input data through the three network components and then return the predicted locations and classification scores.
detect_objects - After a forward pass the predicted objects can be sent to this function during testing in order to perform NMS for the final output.
What variables within the batch_size for loop represent "D" and "$\bar{B}$"?
The NMS pseudocode is written with operations such as union and set subtraction. Within the NMS Python code, how are boxes selected in order to be added to the "D" output?
class SSD300(nn.Module):
    """
    The SSD300 network - encapsulates the base network, auxiliary, and prediction convolutions.

    Class index 0 is treated as background: detect_objects() only searches classes 1 .. n_classes - 1.
    """

    def __init__(self, n_classes, base_type):
        """
        :param n_classes: number of classes scored for each prior box (index 0 is background)
        :param base_type: which base network to build, 'VGG' or 'ResNet'
        :raises NotImplementedError: if base_type is anything else
        """
        super(SSD300, self).__init__()

        self.n_classes = n_classes

        if base_type == 'VGG':
            self.base = VGGBase()
        elif base_type == 'ResNet':
            self.base = ResNetBase()
        else:
            raise NotImplementedError("unsupported base_type {!r}, expected 'VGG' or 'ResNet'".format(base_type))
        self.aux_convs = AuxiliaryConvolutions()
        self.pred_convs = PredictionConvolutions(n_classes)

        # Since lower level features (conv4_3_feats) have considerably larger scales, we take the L2 norm and rescale
        # Rescale factor is initially set at 20, but is learned for each channel during back-prop
        self.rescale_factors = nn.Parameter(torch.FloatTensor(1, 512, 1, 1))  # there are 512 channels in conv4_3_feats
        nn.init.constant_(self.rescale_factors, 20)

        # Prior boxes (a plain tensor attribute created on the module-level `device`)
        self.priors_cxcy = self.create_prior_boxes()

    def forward(self, image):
        """
        Forward propagation.

        :param image: images, a tensor of dimensions (N, 3, 300, 300)
        :return: 8732 locations and class scores (i.e. w.r.t each prior box) for each image
        """
        # Run base network convolutions (lower level feature map generators)
        conv4_3_feats, conv7_feats = self.base(image)  # (N, 512, 38, 38), (N, 1024, 19, 19)

        # Rescale conv4_3 after L2 norm
        # NOTE(review): a position whose 512 activations are all zero has norm 0 and divides
        # by zero here; consider adding an epsilon if NaNs show up during training.
        norm = conv4_3_feats.pow(2).sum(dim=1, keepdim=True).sqrt()  # (N, 1, 38, 38)
        conv4_3_feats = conv4_3_feats / norm  # (N, 512, 38, 38)
        conv4_3_feats = conv4_3_feats * self.rescale_factors  # (N, 512, 38, 38)
        # (PyTorch autobroadcasts singleton dimensions during arithmetic)

        # Run auxiliary convolutions (higher level feature map generators)
        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats = \
            self.aux_convs(conv7_feats)  # (N, 512, 10, 10), (N, 256, 5, 5), (N, 256, 3, 3), (N, 256, 1, 1)

        # Run prediction convolutions (predict offsets w.r.t prior-boxes and classes in each resulting localization box)
        locs, classes_scores = self.pred_convs(conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats,
                                               conv11_2_feats)  # (N, 8732, 4), (N, 8732, n_classes)

        return locs, classes_scores

    def create_prior_boxes(self):
        """
        Create the 8732 prior (default) boxes for the SSD300, as defined in the paper.

        :return: prior boxes in center-size coordinates, a tensor of dimensions (8732, 4)
        """
        fmap_dims = {'conv4_3': 38,
                     'conv7': 19,
                     'conv8_2': 10,
                     'conv9_2': 5,
                     'conv10_2': 3,
                     'conv11_2': 1}

        obj_scales = {'conv4_3': 0.1,
                      'conv7': 0.2,
                      'conv8_2': 0.375,
                      'conv9_2': 0.55,
                      'conv10_2': 0.725,
                      'conv11_2': 0.9}

        aspect_ratios = {'conv4_3': [1., 2., 0.5],
                         'conv7': [1., 2., 3., 0.5, .333],
                         'conv8_2': [1., 2., 3., 0.5, .333],
                         'conv9_2': [1., 2., 3., 0.5, .333],
                         'conv10_2': [1., 2., 0.5],
                         'conv11_2': [1., 2., 0.5]}

        fmaps = list(fmap_dims.keys())

        prior_boxes = []

        for k, fmap in enumerate(fmaps):
            dim = fmap_dims[fmap]
            scale = obj_scales[fmap]

            # For an aspect ratio of 1, use an additional prior whose scale is the geometric mean of the
            # scale of the current feature map and the scale of the next feature map.
            # It only depends on the feature map, so compute it once here rather than per cell.
            if k + 1 < len(fmaps):
                additional_scale = sqrt(scale * obj_scales[fmaps[k + 1]])
            else:
                # For the last feature map, there is no "next" feature map
                additional_scale = 1.

            for i in range(dim):
                for j in range(dim):
                    # Center of cell (i, j) in fractional image coordinates
                    cx = (j + 0.5) / dim
                    cy = (i + 0.5) / dim

                    for ratio in aspect_ratios[fmap]:
                        prior_boxes.append([cx, cy, scale * sqrt(ratio), scale / sqrt(ratio)])

                        # The additional square prior directly follows the ratio-1 prior
                        if ratio == 1.:
                            prior_boxes.append([cx, cy, additional_scale, additional_scale])

        prior_boxes = torch.FloatTensor(prior_boxes).to(device)  # (8732, 4)
        prior_boxes.clamp_(0, 1)  # (8732, 4)

        return prior_boxes

    @staticmethod
    def _nms_keep_mask(overlap, max_overlap):
        """
        Greedy Non-Maximum Suppression over boxes that are already sorted by decreasing score.

        :param overlap: pairwise Jaccard overlaps between the sorted boxes, a tensor of dimensions (n, n)
        :param max_overlap: a box is suppressed if it overlaps a kept, higher-scoring box by more than this
        :return: boolean tensor of dimensions (n), True for the boxes that survive
        """
        n_boxes = overlap.size(0)

        # True implies suppress, False implies don't suppress
        suppress = torch.zeros(n_boxes, dtype=torch.bool, device=overlap.device)  # (n)

        # Consider each box in order of decreasing scores
        for box in range(n_boxes):
            # If this box is already marked for suppression, it may not suppress others
            if suppress[box]:
                continue

            # Suppress boxes whose overlaps (with this box) are greater than maximum overlap.
            # The OR retains previously suppressed boxes.
            suppress |= overlap[box] > max_overlap

            # Don't suppress this box, even though it has an overlap of 1 with itself
            suppress[box] = False

        return ~suppress

    def detect_objects(self, predicted_locs, predicted_scores, min_score, max_overlap, top_k):
        """
        Decipher the 8732 locations and class scores (output of the SSD300) to detect objects.

        For each class, perform Non-Maximum Suppression (NMS) on boxes that are above a minimum threshold.

        :param predicted_locs: predicted locations/boxes w.r.t the 8732 prior boxes, a tensor of dimensions (N, 8732, 4)
        :param predicted_scores: class scores for each of the encoded locations/boxes, a tensor of dimensions (N, 8732, n_classes)
        :param min_score: minimum threshold for a box to be considered a match for a certain class
        :param max_overlap: maximum overlap two boxes can have so that the one with the lower score is not suppressed via NMS
        :param top_k: if there are a lot of resulting detection across all classes, keep only the top 'k'
        :return: detections (boxes, labels, and scores), lists of length batch_size
        """
        batch_size = predicted_locs.size(0)
        n_priors = self.priors_cxcy.size(0)
        predicted_scores = F.softmax(predicted_scores, dim=2)  # (N, 8732, n_classes)

        # Lists to store final predicted boxes, labels, and scores for all images
        all_images_boxes = list()
        all_images_labels = list()
        all_images_scores = list()

        assert n_priors == predicted_locs.size(1) == predicted_scores.size(1)

        for i in range(batch_size):
            # Decode object coordinates from the form we regressed predicted boxes to
            decoded_locs = cxcy_to_xy(
                gcxgcy_to_cxcy(predicted_locs[i], self.priors_cxcy))  # (8732, 4), these are fractional pt. coordinates

            # Lists to store boxes and scores for this image
            image_boxes = list()
            image_labels = list()
            image_scores = list()

            # Check for each class (class 0 is background and is skipped)
            for c in range(1, self.n_classes):
                # Keep only predicted boxes and scores where scores for this class are above the minimum score
                class_scores = predicted_scores[i][:, c]  # (8732)
                score_above_min_score = class_scores > min_score  # boolean mask, for indexing
                n_above_min_score = score_above_min_score.sum().item()
                if n_above_min_score == 0:
                    continue
                class_scores = class_scores[score_above_min_score]  # (n_qualified), n_qualified <= 8732
                class_decoded_locs = decoded_locs[score_above_min_score]  # (n_qualified, 4)

                # Sort predicted boxes and scores by scores
                class_scores, sort_ind = class_scores.sort(dim=0, descending=True)  # (n_qualified), (n_qualified)
                class_decoded_locs = class_decoded_locs[sort_ind]  # (n_qualified, 4)

                # Find the overlap between predicted boxes
                overlap = find_jaccard_overlap(class_decoded_locs, class_decoded_locs)  # (n_qualified, n_qualified)

                # Non-Maximum Suppression (NMS)
                keep = self._nms_keep_mask(overlap, max_overlap)  # (n_qualified), boolean
                n_kept = int(keep.sum().item())

                # Store only unsuppressed boxes for this class
                image_boxes.append(class_decoded_locs[keep])
                image_labels.append(torch.LongTensor(n_kept * [c]).to(device))
                image_scores.append(class_scores[keep])

            # If no object in any class is found, store a placeholder for 'background'
            if len(image_boxes) == 0:
                image_boxes.append(torch.FloatTensor([[0., 0., 1., 1.]]).to(device))
                image_labels.append(torch.LongTensor([0]).to(device))
                image_scores.append(torch.FloatTensor([0.]).to(device))

            # Concatenate into single tensors
            image_boxes = torch.cat(image_boxes, dim=0)  # (n_objects, 4)
            image_labels = torch.cat(image_labels, dim=0)  # (n_objects)
            image_scores = torch.cat(image_scores, dim=0)  # (n_objects)
            n_objects = image_scores.size(0)

            # Keep only the top k objects
            if n_objects > top_k:
                image_scores, sort_ind = image_scores.sort(dim=0, descending=True)
                image_scores = image_scores[:top_k]  # (top_k)
                image_boxes = image_boxes[sort_ind][:top_k]  # (top_k, 4)
                image_labels = image_labels[sort_ind][:top_k]  # (top_k)

            # Append to lists that store predicted boxes and scores for all images
            all_images_boxes.append(image_boxes)
            all_images_labels.append(image_labels)
            all_images_scores.append(image_scores)

        return all_images_boxes, all_images_labels, all_images_scores  # lists of length batch_size
During training the output from the SSD forward pass is then sent to the criterion (set to this function) in order to calculate the loss.
class MultiBoxLoss(nn.Module):
    """
    The MultiBox loss, a loss function for object detection.

    This is a combination of:
    (1) a localization loss for the predicted locations of the boxes, and
    (2) a confidence loss for the predicted class scores.
    """

    def __init__(self, priors_cxcy, threshold=0.5, neg_pos_ratio=3, alpha=1.):
        """
        :param priors_cxcy: prior boxes in center-size coordinates, a tensor of dimensions (n_priors, 4)
        :param threshold: priors whose best overlap with any object is below this value are labelled background
        :param neg_pos_ratio: number of hard-negative priors kept per positive prior in the confidence loss
        :param alpha: weight of the localization loss in the total loss
        """
        super(MultiBoxLoss, self).__init__()
        self.priors_cxcy = priors_cxcy
        self.priors_xy = cxcy_to_xy(priors_cxcy)
        self.threshold = threshold
        self.neg_pos_ratio = neg_pos_ratio
        self.alpha = alpha

        # NOTE(review): despite the attribute name this is a plain L1 loss, not nn.SmoothL1Loss.
        # It is kept as-is so that loss values stay comparable with existing runs - confirm this is intended.
        self.smooth_l1 = nn.L1Loss()
        # Per-prior (unreduced) losses are needed for hard negative mining.
        # reduction='none' replaces the deprecated reduce=False argument.
        self.cross_entropy = nn.CrossEntropyLoss(reduction='none')

    def forward(self, predicted_locs, predicted_scores, boxes, labels):
        """
        Forward propagation.

        :param predicted_locs: predicted locations/boxes w.r.t the 8732 prior boxes, a tensor of dimensions (N, 8732, 4)
        :param predicted_scores: class scores for each of the encoded locations/boxes, a tensor of dimensions (N, 8732, n_classes)
        :param boxes: true object bounding boxes in boundary coordinates, a list of N tensors
        :param labels: true object labels, a list of N tensors
        :return: multibox loss, a scalar
        """
        batch_size = predicted_locs.size(0)
        n_priors = self.priors_cxcy.size(0)
        n_classes = predicted_scores.size(2)
        # Work on the device of the predictions instead of relying on a module-level global
        device = predicted_locs.device

        assert n_priors == predicted_locs.size(1) == predicted_scores.size(1)

        true_locs = torch.zeros((batch_size, n_priors, 4), dtype=torch.float, device=device)  # (N, 8732, 4)
        true_classes = torch.zeros((batch_size, n_priors), dtype=torch.long, device=device)  # (N, 8732)

        # For each image
        for i in range(batch_size):
            # number of objects in this image
            n_objects = boxes[i].size(0)
            if n_objects == 0:
                # No ground truth in this image: every prior stays background (class 0, zero offsets)
                continue

            overlap = find_jaccard_overlap(boxes[i], self.priors_xy)  # (n_objects, 8732)

            # For each prior, find the object that has the maximum overlap
            overlap_for_each_prior, object_for_each_prior = overlap.max(dim=0)  # (8732)

            # We don't want a situation where an object is not represented in our positive (non-background) priors -
            # 1. An object might not be the best object for all priors, and is therefore not in object_for_each_prior.
            # 2. All priors with the object may be assigned as background based on the threshold (0.5).

            # To remedy this -
            # First, find the prior that has the maximum overlap for each object.
            _, prior_for_each_object = overlap.max(dim=1)  # (N_o)

            # Then, assign each object to the corresponding maximum-overlap-prior. (This fixes 1.)
            object_for_each_prior[prior_for_each_object] = torch.arange(n_objects,
                                                                        device=object_for_each_prior.device)

            # To ensure these priors qualify, artificially give them an overlap of greater than 0.5. (This fixes 2.)
            overlap_for_each_prior[prior_for_each_object] = 1.

            # Labels for each prior
            label_for_each_prior = labels[i][object_for_each_prior]  # (8732)
            # Set priors whose overlaps with objects are less than the threshold to be background (no object)
            label_for_each_prior[overlap_for_each_prior < self.threshold] = 0  # (8732)

            # Store
            true_classes[i] = label_for_each_prior

            # Encode center-size object coordinates into the form we regressed predicted boxes to
            true_locs[i] = cxcy_to_gcxgcy(xy_to_cxcy(boxes[i][object_for_each_prior]), self.priors_cxcy)  # (8732, 4)

        # Identify priors that are positive (object/non-background)
        positive_priors = true_classes != 0  # (N, 8732)

        # Number of positive priors per image and in the whole batch
        n_positives = positive_priors.sum(dim=1)  # (N)
        n_positives_total = n_positives.sum()  # (), scalar

        # LOCALIZATION LOSS

        # Localization loss is computed only over positive (non-background) priors.
        # Indexing with a boolean mask across the first two dimensions (N & 8732) flattens them,
        # so predicted_locs[positive_priors] has the shape (total positives, 4).
        if n_positives_total.item() > 0:
            loc_loss = self.smooth_l1(predicted_locs[positive_priors], true_locs[positive_priors])  # (), scalar
        else:
            # A mean over zero elements would be NaN; with no positives there is nothing to localize
            loc_loss = predicted_locs.new_zeros(())

        # CONFIDENCE LOSS

        # Confidence loss is computed over positive priors and the most difficult (hardest) negative priors in each image
        # That is, FOR EACH IMAGE,
        # we will take the hardest (neg_pos_ratio * n_positives) negative priors, i.e where there is maximum loss
        # This is called Hard Negative Mining - it concentrates on hardest negatives in each image, and also minimizes pos/neg imbalance

        # Number of hard-negative priors per image
        n_hard_negatives = self.neg_pos_ratio * n_positives  # (N)

        # First, find the loss for all priors
        conf_loss_all = self.cross_entropy(predicted_scores.view(-1, n_classes), true_classes.view(-1))  # (N * 8732)
        conf_loss_all = conf_loss_all.view(batch_size, n_priors)  # (N, 8732)

        # We already know which priors are positive
        conf_loss_pos = conf_loss_all[positive_priors]  # (sum(n_positives))

        # Next, find which priors are hard-negative
        # To do this, sort ONLY negative priors in each image in order of decreasing loss and take top n_hard_negatives
        conf_loss_neg = conf_loss_all.clone()  # (N, 8732)
        conf_loss_neg[positive_priors] = 0.  # (N, 8732), positive priors are ignored (never in top n_hard_negatives)
        conf_loss_neg, _ = conf_loss_neg.sort(dim=1, descending=True)  # (N, 8732), sorted by decreasing hardness
        hardness_ranks = torch.arange(n_priors, device=conf_loss_neg.device).unsqueeze(0).expand_as(
            conf_loss_neg)  # (N, 8732)
        hard_negatives = hardness_ranks < n_hard_negatives.unsqueeze(1)  # (N, 8732)
        conf_loss_hard_neg = conf_loss_neg[hard_negatives]  # (sum(n_hard_negatives))

        # As in the paper, averaged over positive priors only, although computed over both positive and hard-negative priors.
        # The denominator is clamped so a batch without any positives yields a zero loss instead of dividing by zero.
        conf_loss = (conf_loss_hard_neg.sum() + conf_loss_pos.sum()) / n_positives_total.float().clamp(min=1.)  # (), scalar

        # TOTAL LOSS
        return conf_loss + self.alpha * loc_loss
With the model implemented, it is time to train. Training should take about 2 hours and 9 minutes for 10 epochs.
import time
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
from model import SSD300, MultiBoxLoss
from datasets import PascalVOCDataset
from utils import *
# TODO: Import a learning rate scheduler
from torch.optim.lr_scheduler import StepLR, CosineAnnealingLR, MultiStepLR
# Module-level configuration read by train_SSD() and train() below.

# Data parameters
data_folder = './' # folder with data files
keep_difficult = True # use objects considered difficult to detect?
# Model parameters
# Not too many here since the SSD300 has a very specific structure
n_classes = len(label_map) # number of different types of objects; label_map is presumably provided by `from utils import *` - verify
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available, otherwise the CPU
# Learning parameters
checkpoint = None # path to model checkpoint to resume from, None to train from scratch
batch_size = 6 # number of images per batch
iterations = 30000 # number of iterations to train (DON'T CHANGE); the alternative value noted here previously was 120000
workers = 4 # number of workers for loading data in the DataLoader
print_freq = 200 # print training status every __ batches
momentum = 0.9 # momentum passed to the SGD optimizer
weight_decay = 5e-4 # weight decay passed to the SGD optimizer
grad_clip = None # clip if gradients are exploding, which may happen at larger batch sizes (sometimes at 32) - you will recognize it by a sorting error in the MultiBox loss calculation
cudnn.benchmark = True  # enable cuDNN benchmark mode
# Overwrite the checkpoint function in utils
def save_checkpoint(epoch, model, optimizer, base_type, scheduler=None):
    """
    Save model checkpoint.

    The checkpoint is written to the current working directory as
    'checkpoint_ssd300_<base_type>.pth.tar', or
    'checkpoint_ssd300_<base_type>_scheduler.pth.tar' when a scheduler is given,
    so runs with and without a scheduler do not overwrite each other.

    :param epoch: epoch number
    :param model: model
    :param optimizer: optimizer
    :param base_type: The base network type, used in the checkpoint file name
    :param scheduler: learning rate scheduler to store alongside the model, or None if none is used
    """
    state = {'epoch': epoch,
             'model': model,
             'optimizer': optimizer,
             'scheduler': scheduler}
    # Identity check: None is a singleton, and `==` could be overridden by the scheduler object
    suffix = '' if scheduler is None else '_scheduler'
    filename = 'checkpoint_ssd300_' + base_type + suffix + '.pth.tar'
    torch.save(state, filename)
def train_SSD(base_type, lr_type):
    """
    Training.

    Builds the training DataLoader, creates the SSD300 model and SGD optimizer (or restores them
    from the module-level `checkpoint` path), then trains epoch by epoch, saving a checkpoint
    after every epoch.

    :param base_type: the base network type, passed to SSD300 and used in the checkpoint file name
    :param lr_type: 'original_scheduler' to decay the learning rate manually with adjust_learning_rate,
                    or 'pytorch_scheduler' to use a torch.optim.lr_scheduler scheduler
    :raises NotImplementedError: if lr_type is neither of the two supported values
    """
    # `checkpoint` is only read here; it is deliberately not rebound, so the module-level path
    # stays a path (or None) and the function can be called more than once.
    global start_epoch, label_map, epoch, decay_lr_at

    # Custom dataloaders
    train_dataset = PascalVOCDataset(data_folder,
                                     split='train',
                                     keep_difficult=keep_difficult)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                               collate_fn=train_dataset.collate_fn, num_workers=workers,
                                               pin_memory=True)  # note that we're passing the collate function here

    if lr_type == 'original_scheduler':
        lr = 1e-3  # learning rate
        decay_lr_at = [20000, 25000]  # decay learning rate after these many iterations
        decay_lr_to = 0.1  # decay learning rate to this fraction of the existing learning rate
    elif lr_type == 'pytorch_scheduler':
        lr = 1e-3  # learning rate
    else:
        raise NotImplementedError("Unsupported lr_type: %r" % (lr_type,))

    # Calculate total number of epochs to train and the epochs to decay learning rate at (i.e. convert iterations to epochs)
    # To convert iterations to epochs, divide iterations by the number of iterations per epoch
    # The original paper trains for 120,000 iterations with a batch size of 32, decays after 80,000 and 100,000 iterations
    iterations_per_epoch = len(train_dataset) // batch_size
    epochs = iterations // iterations_per_epoch
    print("Number of Epochs to train:", epochs)
    if lr_type == 'original_scheduler':
        decay_lr_at = [it // iterations_per_epoch for it in decay_lr_at]
        print("Epochs to decay learning rate:", decay_lr_at)

    # Stays None unless lr_type == 'pytorch_scheduler'
    scheduler = None

    # Initialize model or load checkpoint
    if checkpoint is None:
        start_epoch = 0
        model = SSD300(n_classes=n_classes, base_type=base_type)
        # Initialize the optimizer, with twice the default learning rate for biases, as in the original Caffe repo
        biases = list()
        not_biases = list()
        for param_name, param in model.named_parameters():
            if param.requires_grad:
                if param_name.endswith('.bias'):
                    biases.append(param)
                else:
                    not_biases.append(param)
        optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * lr}, {'params': not_biases}],
                                    lr=lr, momentum=momentum, weight_decay=weight_decay)
        if lr_type == 'pytorch_scheduler':
            # Same schedule as 'original_scheduler' with the current settings: x0.1 at epochs 7 and 9
            scheduler = MultiStepLR(optimizer, milestones=[7, 9], gamma=0.1)
    else:
        # NOTE(review): the checkpoint stores whole pickled objects (model, optimizer, scheduler).
        # Recent PyTorch releases default torch.load to weights_only=True, which rejects such files;
        # pass weights_only=False there if the installed version requires it - confirm against the version in use.
        saved_state = torch.load(checkpoint)
        start_epoch = saved_state['epoch'] + 1
        print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
        model = saved_state['model']
        optimizer = saved_state['optimizer']
        if lr_type == 'pytorch_scheduler':
            # Restore the scheduler that save_checkpoint stored next to the model and optimizer
            scheduler = saved_state['scheduler']

    # Move to default device
    model = model.to(device)
    criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)

    # Epochs
    for epoch in range(start_epoch, epochs):

        # Decay learning rate at particular epochs
        if lr_type == 'original_scheduler' and epoch in decay_lr_at:
            adjust_learning_rate(optimizer, decay_lr_to)

        # One epoch's training
        train(train_loader=train_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              epoch=epoch)

        # Update the learning rate once per epoch, after the optimizer steps of that epoch
        if scheduler is not None:
            scheduler.step()

        # Save checkpoint (scheduler is None for 'original_scheduler', which selects the plain file name)
        save_checkpoint(epoch, model, optimizer, base_type, scheduler=scheduler)
def train(train_loader, model, criterion, optimizer, epoch):
    """
    Run one full pass over the training data.

    :param train_loader: DataLoader for training data
    :param model: model
    :param criterion: MultiBox loss
    :param optimizer: optimizer
    :param epoch: epoch number (only used in the status print-out)
    """
    model.train()  # training mode enables dropout

    # Running statistics for this epoch
    losses = AverageMeter()  # loss
    data_time = AverageMeter()  # data loading time
    batch_time = AverageMeter()  # forward prop. + back prop. time

    status_template = ('Epoch: [{0}][{1}/{2}]\t'
                       'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                       'Loss {loss.val:.4f} ({loss.avg:.4f})\t')

    tick = time.time()

    # Batches
    for step, (images, boxes, labels, _) in enumerate(train_loader):
        # Time spent waiting for this batch to be loaded
        data_time.update(time.time() - tick)

        # Move to default device
        images = images.to(device)  # (batch_size (N), 3, 300, 300)
        boxes = [box.to(device) for box in boxes]
        labels = [lab.to(device) for lab in labels]

        # Forward prop. followed by the loss
        predicted_locs, predicted_scores = model(images)  # (N, 8732, 4), (N, 8732, n_classes)
        loss = criterion(predicted_locs, predicted_scores, boxes, labels)  # scalar

        # Backward prop.
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients, if necessary
        if grad_clip is not None:
            clip_gradient(optimizer, grad_clip)

        # Update model
        optimizer.step()

        losses.update(loss.item(), images.size(0))
        batch_time.update(time.time() - tick)

        tick = time.time()

        # Print status only every print_freq batches
        if step % print_freq != 0:
            continue
        print(status_template.format(epoch, step, len(train_loader),
                                     batch_time=batch_time,
                                     data_time=data_time, loss=losses))

    # free some memory since their histories may be stored
    del predicted_locs, predicted_scores, images, boxes, labels
This can be run without making any changes to the code.
# Train SSD300 with the VGG base network using the manual learning rate decay,
# and report the wall-clock duration of the whole run in seconds.
start_time = time.time()
train_SSD(base_type='VGG', lr_type='original_scheduler')
end_time = time.time()
print("time elapsed:", end_time - start_time)
Number of Epochs to train: 10 Epochs to decay learning rate: [7, 9] Loaded base model.
/usr/local/lib/python3.6/dist-packages/torch/nn/_reduction.py:44: UserWarning: size_average and reduce args will be deprecated, please use reduction='none' instead. warnings.warn(warning.format(ret))
Epoch: [0][0/2759] Batch Time 1.774 (1.774) Data Time 1.404 (1.404) Loss 23.3955 (23.3955) Epoch: [0][200/2759] Batch Time 0.287 (0.286) Data Time 0.000 (0.008) Loss 8.2036 (11.0180) Epoch: [0][400/2759] Batch Time 0.286 (0.280) Data Time 0.000 (0.004) Loss 6.7961 (8.8175) Epoch: [0][600/2759] Batch Time 0.261 (0.279) Data Time 0.000 (0.003) Loss 5.6785 (7.9680) Epoch: [0][800/2759] Batch Time 0.288 (0.279) Data Time 0.000 (0.003) Loss 6.2981 (7.5076) Epoch: [0][1000/2759] Batch Time 0.257 (0.278) Data Time 0.000 (0.002) Loss 5.4247 (7.2240) Epoch: [0][1200/2759] Batch Time 0.277 (0.278) Data Time 0.000 (0.002) Loss 5.9191 (7.0280) Epoch: [0][1400/2759] Batch Time 0.295 (0.278) Data Time 0.000 (0.002) Loss 5.1915 (6.8588) Epoch: [0][1600/2759] Batch Time 0.267 (0.278) Data Time 0.000 (0.002) Loss 5.0079 (6.7185) Epoch: [0][1800/2759] Batch Time 0.270 (0.277) Data Time 0.000 (0.002) Loss 5.8228 (6.6014) Epoch: [0][2000/2759] Batch Time 0.253 (0.277) Data Time 0.000 (0.002) Loss 4.7524 (6.4970) Epoch: [0][2200/2759] Batch Time 0.269 (0.277) Data Time 0.003 (0.002) Loss 4.9170 (6.4109) Epoch: [0][2400/2759] Batch Time 0.282 (0.277) Data Time 0.000 (0.002) Loss 5.3106 (6.3256) Epoch: [0][2600/2759] Batch Time 0.286 (0.277) Data Time 0.005 (0.002) Loss 4.8807 (6.2475) Epoch: [1][0/2759] Batch Time 1.473 (1.473) Data Time 1.112 (1.112) Loss 4.9841 (4.9841) Epoch: [1][200/2759] Batch Time 0.274 (0.284) Data Time 0.000 (0.007) Loss 5.3943 (5.1638) Epoch: [1][400/2759] Batch Time 0.267 (0.282) Data Time 0.000 (0.004) Loss 4.9127 (5.1548) Epoch: [1][600/2759] Batch Time 0.303 (0.280) Data Time 0.000 (0.003) Loss 3.5322 (5.0990) Epoch: [1][800/2759] Batch Time 0.263 (0.280) Data Time 0.000 (0.003) Loss 5.2833 (5.0659) Epoch: [1][1000/2759] Batch Time 0.286 (0.280) Data Time 0.000 (0.003) Loss 5.2642 (5.0181) Epoch: [1][1200/2759] Batch Time 0.268 (0.280) Data Time 0.000 (0.003) Loss 4.7500 (4.9827) Epoch: [1][1400/2759] Batch Time 0.269 (0.280) Data Time 0.009 (0.002) Loss 
5.0729 (4.9511) Epoch: [1][1600/2759] Batch Time 0.266 (0.280) Data Time 0.000 (0.002) Loss 3.5885 (4.9219) Epoch: [1][1800/2759] Batch Time 0.298 (0.279) Data Time 0.000 (0.002) Loss 3.9362 (4.8984) Epoch: [1][2000/2759] Batch Time 0.278 (0.279) Data Time 0.010 (0.002) Loss 4.1185 (4.8742) Epoch: [1][2200/2759] Batch Time 0.273 (0.279) Data Time 0.000 (0.002) Loss 5.0521 (4.8470) Epoch: [1][2400/2759] Batch Time 0.283 (0.279) Data Time 0.000 (0.002) Loss 4.5573 (4.8211) Epoch: [1][2600/2759] Batch Time 0.264 (0.279) Data Time 0.000 (0.002) Loss 3.7353 (4.7957) Epoch: [2][0/2759] Batch Time 1.515 (1.515) Data Time 1.135 (1.135) Loss 3.6839 (3.6839) Epoch: [2][200/2759] Batch Time 0.286 (0.289) Data Time 0.000 (0.008) Loss 4.8827 (4.4722) Epoch: [2][400/2759] Batch Time 0.285 (0.283) Data Time 0.000 (0.004) Loss 4.1496 (4.3968) Epoch: [2][600/2759] Batch Time 0.267 (0.280) Data Time 0.000 (0.003) Loss 4.3043 (4.3771) Epoch: [2][800/2759] Batch Time 0.293 (0.279) Data Time 0.000 (0.003) Loss 4.0316 (4.3620) Epoch: [2][1000/2759] Batch Time 0.277 (0.279) Data Time 0.000 (0.002) Loss 5.1344 (4.3516) Epoch: [2][1200/2759] Batch Time 0.304 (0.278) Data Time 0.000 (0.002) Loss 3.9433 (4.3327) Epoch: [2][1400/2759] Batch Time 0.299 (0.278) Data Time 0.005 (0.002) Loss 3.1659 (4.3131) Epoch: [2][1600/2759] Batch Time 0.262 (0.277) Data Time 0.005 (0.002) Loss 3.4038 (4.3025) Epoch: [2][1800/2759] Batch Time 0.288 (0.278) Data Time 0.000 (0.002) Loss 4.4169 (4.2829) Epoch: [2][2000/2759] Batch Time 0.286 (0.278) Data Time 0.000 (0.002) Loss 4.6821 (4.2660) Epoch: [2][2200/2759] Batch Time 0.262 (0.277) Data Time 0.000 (0.002) Loss 5.7007 (4.2552) Epoch: [2][2400/2759] Batch Time 0.280 (0.277) Data Time 0.009 (0.002) Loss 4.4181 (4.2395) Epoch: [2][2600/2759] Batch Time 0.290 (0.278) Data Time 0.000 (0.002) Loss 4.1760 (4.2312) Epoch: [3][0/2759] Batch Time 1.173 (1.173) Data Time 0.818 (0.818) Loss 4.0496 (4.0496) Epoch: [3][200/2759] Batch Time 0.269 (0.284) Data Time 0.000 
(0.010) Loss 3.4590 (4.0149) Epoch: [3][400/2759] Batch Time 0.268 (0.277) Data Time 0.000 (0.005) Loss 4.3305 (4.0038) Epoch: [3][600/2759] Batch Time 0.253 (0.276) Data Time 0.000 (0.004) Loss 2.9417 (3.9863) Epoch: [3][800/2759] Batch Time 0.257 (0.275) Data Time 0.000 (0.003) Loss 3.5445 (3.9974) Epoch: [3][1000/2759] Batch Time 0.263 (0.275) Data Time 0.000 (0.003) Loss 3.9597 (3.9915) Epoch: [3][1200/2759] Batch Time 0.270 (0.275) Data Time 0.000 (0.002) Loss 4.2115 (3.9826) Epoch: [3][1400/2759] Batch Time 0.260 (0.274) Data Time 0.000 (0.002) Loss 3.7368 (3.9685) Epoch: [3][1600/2759] Batch Time 0.273 (0.274) Data Time 0.000 (0.002) Loss 3.7478 (3.9543) Epoch: [3][1800/2759] Batch Time 0.270 (0.274) Data Time 0.003 (0.002) Loss 4.6301 (3.9394) Epoch: [3][2000/2759] Batch Time 0.270 (0.274) Data Time 0.000 (0.002) Loss 4.0360 (3.9327) Epoch: [3][2200/2759] Batch Time 0.266 (0.274) Data Time 0.000 (0.002) Loss 3.9757 (3.9204) Epoch: [3][2400/2759] Batch Time 0.278 (0.274) Data Time 0.000 (0.002) Loss 3.9905 (3.9156) Epoch: [3][2600/2759] Batch Time 0.274 (0.274) Data Time 0.000 (0.002) Loss 3.1485 (3.9088) Epoch: [4][0/2759] Batch Time 1.394 (1.394) Data Time 1.050 (1.050) Loss 3.7799 (3.7799) Epoch: [4][200/2759] Batch Time 0.266 (0.285) Data Time 0.000 (0.009) Loss 3.9814 (3.7491) Epoch: [4][400/2759] Batch Time 0.286 (0.280) Data Time 0.000 (0.005) Loss 3.8096 (3.7265) Epoch: [4][600/2759] Batch Time 0.271 (0.278) Data Time 0.000 (0.004) Loss 3.4707 (3.7523) Epoch: [4][800/2759] Batch Time 0.276 (0.277) Data Time 0.000 (0.003) Loss 4.0862 (3.7582) Epoch: [4][1000/2759] Batch Time 0.285 (0.277) Data Time 0.000 (0.002) Loss 3.7611 (3.7550) Epoch: [4][1200/2759] Batch Time 0.267 (0.276) Data Time 0.010 (0.002) Loss 3.4306 (3.7438) Epoch: [4][1400/2759] Batch Time 0.261 (0.277) Data Time 0.000 (0.002) Loss 3.5766 (3.7409) Epoch: [4][1600/2759] Batch Time 0.265 (0.276) Data Time 0.000 (0.002) Loss 3.9978 (3.7362) Epoch: [4][1800/2759] Batch Time 0.259 (0.276) 
Data Time 0.000 (0.002) Loss 3.7056 (3.7435) Epoch: [4][2000/2759] Batch Time 0.259 (0.276) Data Time 0.000 (0.002) Loss 3.3793 (3.7414) Epoch: [4][2200/2759] Batch Time 0.278 (0.276) Data Time 0.000 (0.002) Loss 3.3298 (3.7353) Epoch: [4][2400/2759] Batch Time 0.257 (0.276) Data Time 0.000 (0.001) Loss 3.1476 (3.7340) Epoch: [4][2600/2759] Batch Time 0.277 (0.276) Data Time 0.000 (0.001) Loss 3.2075 (3.7347) Epoch: [5][0/2759] Batch Time 1.838 (1.838) Data Time 1.462 (1.462) Loss 3.7509 (3.7509) Epoch: [5][200/2759] Batch Time 0.279 (0.286) Data Time 0.000 (0.009) Loss 3.2670 (3.6523) Epoch: [5][400/2759] Batch Time 0.253 (0.280) Data Time 0.000 (0.005) Loss 4.1438 (3.6323) Epoch: [5][600/2759] Batch Time 0.273 (0.278) Data Time 0.000 (0.004) Loss 3.5138 (3.6178) Epoch: [5][800/2759] Batch Time 0.286 (0.278) Data Time 0.000 (0.003) Loss 2.8436 (3.6271) Epoch: [5][1000/2759] Batch Time 0.268 (0.277) Data Time 0.000 (0.002) Loss 4.6562 (3.6314) Epoch: [5][1200/2759] Batch Time 0.281 (0.277) Data Time 0.000 (0.002) Loss 3.5419 (3.6313) Epoch: [5][1400/2759] Batch Time 0.305 (0.277) Data Time 0.002 (0.002) Loss 4.1764 (3.6219) Epoch: [5][1600/2759] Batch Time 0.280 (0.277) Data Time 0.000 (0.002) Loss 3.2758 (3.6166) Epoch: [5][1800/2759] Batch Time 0.298 (0.277) Data Time 0.000 (0.002) Loss 3.8401 (3.6108) Epoch: [5][2000/2759] Batch Time 0.260 (0.277) Data Time 0.004 (0.002) Loss 3.3088 (3.6099) Epoch: [5][2200/2759] Batch Time 0.258 (0.277) Data Time 0.000 (0.002) Loss 3.8449 (3.6023) Epoch: [5][2400/2759] Batch Time 0.266 (0.277) Data Time 0.000 (0.002) Loss 3.0408 (3.5969) Epoch: [5][2600/2759] Batch Time 0.284 (0.277) Data Time 0.000 (0.002) Loss 3.8718 (3.5951) Epoch: [6][0/2759] Batch Time 1.215 (1.215) Data Time 0.857 (0.857) Loss 3.6201 (3.6201) Epoch: [6][200/2759] Batch Time 0.265 (0.280) Data Time 0.000 (0.006) Loss 2.7247 (3.4922) Epoch: [6][400/2759] Batch Time 0.277 (0.279) Data Time 0.000 (0.004) Loss 3.7861 (3.4926) Epoch: [6][600/2759] Batch Time 
0.273 (0.278) Data Time 0.000 (0.003) Loss 3.5959 (3.5110) Epoch: [6][800/2759] Batch Time 0.274 (0.277) Data Time 0.000 (0.002) Loss 3.6164 (3.5018) Epoch: [6][1000/2759] Batch Time 0.271 (0.277) Data Time 0.000 (0.002) Loss 4.0391 (3.5025) Epoch: [6][1200/2759] Batch Time 0.260 (0.276) Data Time 0.000 (0.002) Loss 2.8241 (3.4965) Epoch: [6][1400/2759] Batch Time 0.261 (0.276) Data Time 0.000 (0.002) Loss 3.2815 (3.5028) Epoch: [6][1600/2759] Batch Time 0.289 (0.276) Data Time 0.000 (0.002) Loss 4.0380 (3.5005) Epoch: [6][1800/2759] Batch Time 0.291 (0.276) Data Time 0.000 (0.001) Loss 2.3567 (3.4962) Epoch: [6][2000/2759] Batch Time 0.278 (0.276) Data Time 0.004 (0.001) Loss 4.3124 (3.4921) Epoch: [6][2200/2759] Batch Time 0.267 (0.276) Data Time 0.000 (0.001) Loss 2.4620 (3.4855) Epoch: [6][2400/2759] Batch Time 0.275 (0.276) Data Time 0.000 (0.001) Loss 2.7452 (3.4784) Epoch: [6][2600/2759] Batch Time 0.276 (0.275) Data Time 0.000 (0.001) Loss 3.6782 (3.4767) DECAYING learning rate. 
The new LR is 0.000100 Epoch: [7][0/2759] Batch Time 1.176 (1.176) Data Time 0.827 (0.827) Loss 3.7708 (3.7708) Epoch: [7][200/2759] Batch Time 0.269 (0.282) Data Time 0.000 (0.006) Loss 3.3783 (3.3019) Epoch: [7][400/2759] Batch Time 0.256 (0.278) Data Time 0.000 (0.003) Loss 4.1004 (3.2421) Epoch: [7][600/2759] Batch Time 0.260 (0.276) Data Time 0.000 (0.003) Loss 2.5754 (3.2073) Epoch: [7][800/2759] Batch Time 0.273 (0.276) Data Time 0.000 (0.002) Loss 3.0700 (3.2027) Epoch: [7][1000/2759] Batch Time 0.291 (0.275) Data Time 0.000 (0.002) Loss 2.9719 (3.1771) Epoch: [7][1200/2759] Batch Time 0.268 (0.275) Data Time 0.000 (0.002) Loss 2.4282 (3.1618) Epoch: [7][1400/2759] Batch Time 0.291 (0.275) Data Time 0.000 (0.002) Loss 3.6032 (3.1496) Epoch: [7][1600/2759] Batch Time 0.259 (0.275) Data Time 0.000 (0.002) Loss 2.3415 (3.1435) Epoch: [7][1800/2759] Batch Time 0.282 (0.275) Data Time 0.000 (0.001) Loss 2.6160 (3.1406) Epoch: [7][2000/2759] Batch Time 0.285 (0.275) Data Time 0.000 (0.001) Loss 2.9954 (3.1359) Epoch: [7][2200/2759] Batch Time 0.273 (0.275) Data Time 0.000 (0.001) Loss 3.1210 (3.1373) Epoch: [7][2400/2759] Batch Time 0.289 (0.275) Data Time 0.000 (0.001) Loss 2.6915 (3.1353) Epoch: [7][2600/2759] Batch Time 0.276 (0.275) Data Time 0.000 (0.001) Loss 3.7422 (3.1388) Epoch: [8][0/2759] Batch Time 1.393 (1.393) Data Time 1.054 (1.054) Loss 2.6536 (2.6536) Epoch: [8][200/2759] Batch Time 0.261 (0.284) Data Time 0.000 (0.007) Loss 2.8003 (3.0984) Epoch: [8][400/2759] Batch Time 0.294 (0.280) Data Time 0.003 (0.004) Loss 3.3204 (3.1038) Epoch: [8][600/2759] Batch Time 0.255 (0.277) Data Time 0.000 (0.003) Loss 3.4132 (3.0977) Epoch: [8][800/2759] Batch Time 0.275 (0.277) Data Time 0.000 (0.002) Loss 3.1088 (3.0719) Epoch: [8][1000/2759] Batch Time 0.284 (0.277) Data Time 0.000 (0.002) Loss 3.3024 (3.0853) Epoch: [8][1200/2759] Batch Time 0.288 (0.276) Data Time 0.000 (0.002) Loss 2.8562 (3.0880) Epoch: [8][1400/2759] Batch Time 0.287 (0.277) Data Time 
0.000 (0.002) Loss 3.3643 (3.0771) Epoch: [8][1600/2759] Batch Time 0.271 (0.276) Data Time 0.000 (0.002) Loss 3.6093 (3.0788) Epoch: [8][1800/2759] Batch Time 0.270 (0.276) Data Time 0.000 (0.002) Loss 3.3386 (3.0771) Epoch: [8][2000/2759] Batch Time 0.277 (0.276) Data Time 0.000 (0.001) Loss 3.0603 (3.0694) Epoch: [8][2200/2759] Batch Time 0.273 (0.276) Data Time 0.000 (0.001) Loss 3.5251 (3.0674) Epoch: [8][2400/2759] Batch Time 0.268 (0.276) Data Time 0.000 (0.001) Loss 2.7012 (3.0635) Epoch: [8][2600/2759] Batch Time 0.276 (0.276) Data Time 0.000 (0.001) Loss 3.0878 (3.0610) DECAYING learning rate. The new LR is 0.000010 Epoch: [9][0/2759] Batch Time 1.392 (1.392) Data Time 1.048 (1.048) Loss 2.6528 (2.6528) Epoch: [9][200/2759] Batch Time 0.264 (0.282) Data Time 0.000 (0.008) Loss 3.7740 (3.0958) Epoch: [9][400/2759] Batch Time 0.274 (0.278) Data Time 0.000 (0.004) Loss 3.5090 (3.0420) Epoch: [9][600/2759] Batch Time 0.295 (0.278) Data Time 0.000 (0.003) Loss 3.6395 (3.0347) Epoch: [9][800/2759] Batch Time 0.298 (0.277) Data Time 0.007 (0.003) Loss 2.1627 (3.0256) Epoch: [9][1000/2759] Batch Time 0.282 (0.277) Data Time 0.000 (0.002) Loss 3.0786 (3.0347) Epoch: [9][1200/2759] Batch Time 0.271 (0.276) Data Time 0.000 (0.002) Loss 2.0694 (3.0256) Epoch: [9][1400/2759] Batch Time 0.279 (0.276) Data Time 0.000 (0.002) Loss 2.3670 (3.0311) Epoch: [9][1600/2759] Batch Time 0.297 (0.276) Data Time 0.000 (0.002) Loss 2.7239 (3.0314) Epoch: [9][1800/2759] Batch Time 0.271 (0.276) Data Time 0.000 (0.002) Loss 2.6744 (3.0314) Epoch: [9][2000/2759] Batch Time 0.288 (0.275) Data Time 0.000 (0.002) Loss 3.0921 (3.0284) Epoch: [9][2200/2759] Batch Time 0.295 (0.275) Data Time 0.005 (0.002) Loss 3.1735 (3.0271) Epoch: [9][2400/2759] Batch Time 0.263 (0.275) Data Time 0.000 (0.001) Loss 2.8854 (3.0225) Epoch: [9][2600/2759] Batch Time 0.276 (0.275) Data Time 0.000 (0.001) Loss 3.1117 (3.0240) time elapsed: 7627.683887720108
This should be run after implementing the ResNet Base.
# Train SSD300 with the ResNet base network using the manual learning rate decay,
# and report the wall-clock duration of the whole run in seconds.
start_time = time.time()
train_SSD(base_type='ResNet', lr_type='original_scheduler')
end_time = time.time()
print("time elapsed:", end_time - start_time)
Number of Epochs to train: 10 Epochs to decay learning rate: [7, 9]
Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /root/.cache/torch/hub/checkpoints/resnet34-333f7ec4.pth
/usr/local/lib/python3.7/dist-packages/torch/nn/_reduction.py:44: UserWarning: size_average and reduce args will be deprecated, please use reduction='none' instead. warnings.warn(warning.format(ret))
Epoch: [0][0/2759] Batch Time 105.858 (105.858) Data Time 104.393 (104.393) Loss 21.8664 (21.8664) Epoch: [0][200/2759] Batch Time 1.164 (0.967) Data Time 1.042 (0.829) Loss 6.6998 (9.1211) Epoch: [0][400/2759] Batch Time 1.107 (0.704) Data Time 0.984 (0.568) Loss 6.4495 (7.9832) Epoch: [0][600/2759] Batch Time 0.261 (0.610) Data Time 0.144 (0.475) Loss 6.2675 (7.5460) Epoch: [0][800/2759] Batch Time 0.129 (0.562) Data Time 0.000 (0.428) Loss 7.2126 (7.3045) Epoch: [0][1000/2759] Batch Time 0.149 (0.539) Data Time 0.000 (0.404) Loss 6.8299 (7.1662) Epoch: [0][1200/2759] Batch Time 0.130 (0.521) Data Time 0.000 (0.387) Loss 6.7394 (7.0444) Epoch: [0][1400/2759] Batch Time 0.142 (0.508) Data Time 0.001 (0.374) Loss 7.0547 (6.9392) Epoch: [0][1600/2759] Batch Time 1.219 (0.500) Data Time 1.099 (0.367) Loss 6.8244 (6.8506) Epoch: [0][1800/2759] Batch Time 0.127 (0.494) Data Time 0.000 (0.361) Loss 6.0426 (6.7643) Epoch: [0][2000/2759] Batch Time 0.143 (0.488) Data Time 0.000 (0.355) Loss 5.9645 (6.6970) Epoch: [0][2200/2759] Batch Time 0.138 (0.483) Data Time 0.000 (0.350) Loss 5.6714 (6.6356) Epoch: [0][2400/2759] Batch Time 0.249 (0.479) Data Time 0.114 (0.346) Loss 5.3855 (6.5670) Epoch: [0][2600/2759] Batch Time 1.463 (0.475) Data Time 1.335 (0.343) Loss 5.7102 (6.5032) Epoch: [1][0/2759] Batch Time 1.541 (1.541) Data Time 1.230 (1.230) Loss 5.9436 (5.9436) Epoch: [1][200/2759] Batch Time 0.185 (0.234) Data Time 0.000 (0.054) Loss 5.5904 (5.5152) Epoch: [1][400/2759] Batch Time 0.410 (0.227) Data Time 0.262 (0.051) Loss 6.2004 (5.4884) Epoch: [1][600/2759] Batch Time 0.152 (0.227) Data Time 0.000 (0.051) Loss 4.8451 (5.5071) Epoch: [1][800/2759] Batch Time 0.164 (0.226) Data Time 0.000 (0.050) Loss 5.0963 (5.4856) Epoch: [1][1000/2759] Batch Time 0.233 (0.226) Data Time 0.000 (0.050) Loss 3.8400 (5.4478) Epoch: [1][1200/2759] Batch Time 0.500 (0.227) Data Time 0.345 (0.052) Loss 4.3515 (5.4091) Epoch: [1][1400/2759] Batch Time 0.177 (0.227) Data Time 0.000 (0.052) 
Loss 6.5441 (5.3921) Epoch: [1][1600/2759] Batch Time 0.234 (0.227) Data Time 0.000 (0.052) Loss 5.9475 (5.3650) Epoch: [1][1800/2759] Batch Time 0.153 (0.227) Data Time 0.000 (0.053) Loss 4.9269 (5.3673) Epoch: [1][2000/2759] Batch Time 0.144 (0.227) Data Time 0.000 (0.053) Loss 5.1750 (5.3485) Epoch: [1][2200/2759] Batch Time 0.156 (0.226) Data Time 0.000 (0.052) Loss 6.2647 (5.3126) Epoch: [1][2400/2759] Batch Time 0.177 (0.226) Data Time 0.000 (0.052) Loss 6.3991 (5.2803) Epoch: [1][2600/2759] Batch Time 0.614 (0.226) Data Time 0.419 (0.052) Loss 5.8678 (5.2590) Epoch: [2][0/2759] Batch Time 1.106 (1.106) Data Time 0.853 (0.853) Loss 4.0198 (4.0198) Epoch: [2][200/2759] Batch Time 0.150 (0.229) Data Time 0.000 (0.056) Loss 3.3443 (4.8810) Epoch: [2][400/2759] Batch Time 0.578 (0.223) Data Time 0.408 (0.052) Loss 6.2361 (4.8413) Epoch: [2][600/2759] Batch Time 0.180 (0.221) Data Time 0.000 (0.050) Loss 5.1517 (4.8115) Epoch: [2][800/2759] Batch Time 0.344 (0.223) Data Time 0.217 (0.052) Loss 5.8167 (4.8261) Epoch: [2][1000/2759] Batch Time 0.154 (0.224) Data Time 0.000 (0.053) Loss 5.3525 (4.8354) Epoch: [2][1200/2759] Batch Time 0.170 (0.224) Data Time 0.000 (0.053) Loss 3.8204 (4.8140) Epoch: [2][1400/2759] Batch Time 0.286 (0.224) Data Time 0.077 (0.053) Loss 4.6945 (4.8038) Epoch: [2][1600/2759] Batch Time 0.489 (0.225) Data Time 0.298 (0.054) Loss 5.2595 (4.7922) Epoch: [2][1800/2759] Batch Time 0.164 (0.224) Data Time 0.005 (0.053) Loss 3.1891 (4.7713) Epoch: [2][2000/2759] Batch Time 0.162 (0.225) Data Time 0.000 (0.053) Loss 5.4595 (4.7451) Epoch: [2][2200/2759] Batch Time 0.427 (0.225) Data Time 0.292 (0.053) Loss 4.6972 (4.7317) Epoch: [2][2400/2759] Batch Time 0.271 (0.225) Data Time 0.127 (0.053) Loss 4.7779 (4.7232) Epoch: [2][2600/2759] Batch Time 0.187 (0.226) Data Time 0.000 (0.054) Loss 5.5085 (4.7122) Epoch: [3][0/2759] Batch Time 1.355 (1.355) Data Time 1.038 (1.038) Loss 4.7946 (4.7946) Epoch: [3][200/2759] Batch Time 0.167 (0.242) Data Time 
0.000 (0.066) Loss 5.2232 (4.5939) Epoch: [3][400/2759] Batch Time 0.177 (0.235) Data Time 0.000 (0.061) Loss 4.1215 (4.5610) Epoch: [3][600/2759] Batch Time 0.162 (0.232) Data Time 0.000 (0.058) Loss 4.9820 (4.5343) Epoch: [3][800/2759] Batch Time 0.164 (0.230) Data Time 0.000 (0.057) Loss 3.1685 (4.5549) Epoch: [3][1000/2759] Batch Time 0.167 (0.230) Data Time 0.000 (0.057) Loss 3.7950 (4.5143) Epoch: [3][1200/2759] Batch Time 0.172 (0.229) Data Time 0.000 (0.056) Loss 5.0407 (4.4891) Epoch: [3][1400/2759] Batch Time 0.193 (0.229) Data Time 0.000 (0.055) Loss 5.0871 (4.4708) Epoch: [3][1600/2759] Batch Time 0.208 (0.228) Data Time 0.000 (0.055) Loss 3.1162 (4.4506) Epoch: [3][1800/2759] Batch Time 0.217 (0.228) Data Time 0.001 (0.054) Loss 3.6395 (4.4651) Epoch: [3][2000/2759] Batch Time 0.204 (0.228) Data Time 0.000 (0.054) Loss 4.8684 (4.4786) Epoch: [3][2200/2759] Batch Time 0.186 (0.228) Data Time 0.005 (0.054) Loss 3.9491 (4.4677) Epoch: [3][2400/2759] Batch Time 0.165 (0.227) Data Time 0.005 (0.053) Loss 5.5325 (4.4484) Epoch: [3][2600/2759] Batch Time 0.333 (0.227) Data Time 0.158 (0.053) Loss 3.7070 (4.4396) Epoch: [4][0/2759] Batch Time 1.818 (1.818) Data Time 1.583 (1.583) Loss 4.2591 (4.2591) Epoch: [4][200/2759] Batch Time 0.483 (0.238) Data Time 0.327 (0.060) Loss 4.3703 (4.3040) Epoch: [4][400/2759] Batch Time 0.161 (0.230) Data Time 0.005 (0.055) Loss 4.3832 (4.2305) Epoch: [4][600/2759] Batch Time 0.165 (0.229) Data Time 0.000 (0.054) Loss 4.6354 (4.1975) Epoch: [4][800/2759] Batch Time 0.176 (0.229) Data Time 0.012 (0.054) Loss 4.1909 (4.2124) Epoch: [4][1000/2759] Batch Time 0.431 (0.229) Data Time 0.262 (0.054) Loss 4.8415 (4.2088) Epoch: [4][1200/2759] Batch Time 0.186 (0.228) Data Time 0.000 (0.054) Loss 3.3298 (4.2043) Epoch: [4][1400/2759] Batch Time 0.620 (0.228) Data Time 0.484 (0.054) Loss 4.5607 (4.2093) Epoch: [4][1600/2759] Batch Time 0.317 (0.228) Data Time 0.112 (0.054) Loss 3.0423 (4.1993) Epoch: [4][1800/2759] Batch Time 0.173 
(0.228) Data Time 0.000 (0.054) Loss 4.9946 (4.2015) Epoch: [4][2000/2759] Batch Time 0.162 (0.227) Data Time 0.000 (0.054) Loss 3.8958 (4.2013) Epoch: [4][2200/2759] Batch Time 0.157 (0.227) Data Time 0.000 (0.053) Loss 4.4523 (4.1908) Epoch: [4][2400/2759] Batch Time 0.404 (0.227) Data Time 0.255 (0.053) Loss 3.8775 (4.1837) Epoch: [4][2600/2759] Batch Time 0.194 (0.227) Data Time 0.000 (0.053) Loss 5.2359 (4.1786) Epoch: [5][0/2759] Batch Time 0.899 (0.899) Data Time 0.664 (0.664) Loss 4.2578 (4.2578) Epoch: [5][200/2759] Batch Time 0.438 (0.236) Data Time 0.279 (0.058) Loss 4.0484 (4.0891) Epoch: [5][400/2759] Batch Time 0.393 (0.229) Data Time 0.237 (0.051) Loss 3.4120 (4.0432) Epoch: [5][600/2759] Batch Time 0.522 (0.230) Data Time 0.357 (0.055) Loss 5.0649 (4.0559) Epoch: [5][800/2759] Batch Time 0.443 (0.230) Data Time 0.294 (0.056) Loss 4.3885 (4.0648) Epoch: [5][1000/2759] Batch Time 0.170 (0.229) Data Time 0.006 (0.056) Loss 2.4369 (4.0644) Epoch: [5][1200/2759] Batch Time 0.171 (0.230) Data Time 0.005 (0.056) Loss 4.2253 (4.0551) Epoch: [5][1400/2759] Batch Time 0.181 (0.230) Data Time 0.000 (0.056) Loss 3.5875 (4.0547) Epoch: [5][1600/2759] Batch Time 0.243 (0.229) Data Time 0.076 (0.055) Loss 4.0493 (4.0508) Epoch: [5][1800/2759] Batch Time 0.149 (0.229) Data Time 0.000 (0.054) Loss 3.8493 (4.0371) Epoch: [5][2000/2759] Batch Time 0.738 (0.229) Data Time 0.574 (0.055) Loss 3.9566 (4.0370) Epoch: [5][2200/2759] Batch Time 0.171 (0.229) Data Time 0.000 (0.056) Loss 4.7293 (4.0284) Epoch: [5][2400/2759] Batch Time 0.378 (0.228) Data Time 0.238 (0.055) Loss 3.4849 (4.0220) Epoch: [5][2600/2759] Batch Time 0.224 (0.228) Data Time 0.005 (0.055) Loss 2.7130 (4.0132) Epoch: [6][0/2759] Batch Time 1.311 (1.311) Data Time 1.037 (1.037) Loss 4.8456 (4.8456) Epoch: [6][200/2759] Batch Time 0.212 (0.234) Data Time 0.057 (0.060) Loss 4.8595 (3.9700) Epoch: [6][400/2759] Batch Time 0.237 (0.232) Data Time 0.014 (0.059) Loss 4.3131 (4.0023) Epoch: [6][600/2759] Batch 
Time 0.193 (0.230) Data Time 0.004 (0.057) Loss 4.2408 (3.9720) Epoch: [6][800/2759] Batch Time 0.378 (0.228) Data Time 0.209 (0.054) Loss 4.4256 (3.9841) Epoch: [6][1000/2759] Batch Time 0.154 (0.228) Data Time 0.000 (0.054) Loss 4.8886 (3.9602) Epoch: [6][1200/2759] Batch Time 0.157 (0.227) Data Time 0.000 (0.053) Loss 4.1726 (3.9403) Epoch: [6][1400/2759] Batch Time 0.409 (0.227) Data Time 0.229 (0.053) Loss 3.6542 (3.9366) Epoch: [6][1600/2759] Batch Time 0.166 (0.228) Data Time 0.000 (0.054) Loss 3.9682 (3.9173) Epoch: [6][1800/2759] Batch Time 0.215 (0.227) Data Time 0.000 (0.054) Loss 3.9990 (3.9258) Epoch: [6][2000/2759] Batch Time 0.208 (0.227) Data Time 0.047 (0.053) Loss 3.2690 (3.9200) Epoch: [6][2200/2759] Batch Time 0.363 (0.227) Data Time 0.203 (0.054) Loss 4.2684 (3.9230) Epoch: [6][2400/2759] Batch Time 0.519 (0.228) Data Time 0.390 (0.054) Loss 3.0756 (3.9219) Epoch: [6][2600/2759] Batch Time 0.178 (0.227) Data Time 0.005 (0.054) Loss 3.7606 (3.9175) DECAYING learning rate. 
The new LR is 0.000100 Epoch: [7][0/2759] Batch Time 1.614 (1.614) Data Time 1.365 (1.365) Loss 4.7539 (4.7539) Epoch: [7][200/2759] Batch Time 0.157 (0.244) Data Time 0.003 (0.068) Loss 3.1443 (3.5939) Epoch: [7][400/2759] Batch Time 0.223 (0.235) Data Time 0.000 (0.059) Loss 3.8417 (3.5253) Epoch: [7][600/2759] Batch Time 0.191 (0.231) Data Time 0.049 (0.058) Loss 3.5827 (3.4885) Epoch: [7][800/2759] Batch Time 0.666 (0.230) Data Time 0.523 (0.057) Loss 3.7650 (3.4546) Epoch: [7][1000/2759] Batch Time 0.165 (0.230) Data Time 0.000 (0.057) Loss 3.5643 (3.4377) Epoch: [7][1200/2759] Batch Time 0.176 (0.229) Data Time 0.010 (0.056) Loss 4.0627 (3.4237) Epoch: [7][1400/2759] Batch Time 0.156 (0.229) Data Time 0.000 (0.055) Loss 3.6544 (3.4110) Epoch: [7][1600/2759] Batch Time 0.227 (0.229) Data Time 0.056 (0.056) Loss 3.7142 (3.4054) Epoch: [7][1800/2759] Batch Time 0.186 (0.230) Data Time 0.000 (0.056) Loss 3.1036 (3.3902) Epoch: [7][2000/2759] Batch Time 0.170 (0.230) Data Time 0.000 (0.056) Loss 2.5111 (3.3836) Epoch: [7][2200/2759] Batch Time 0.162 (0.229) Data Time 0.033 (0.055) Loss 2.5709 (3.3774) Epoch: [7][2400/2759] Batch Time 0.193 (0.229) Data Time 0.000 (0.055) Loss 4.5197 (3.3669) Epoch: [7][2600/2759] Batch Time 0.222 (0.229) Data Time 0.000 (0.055) Loss 3.5873 (3.3599) Epoch: [8][0/2759] Batch Time 1.669 (1.669) Data Time 1.430 (1.430) Loss 3.2858 (3.2858) Epoch: [8][200/2759] Batch Time 0.179 (0.242) Data Time 0.003 (0.065) Loss 3.9765 (3.3174) Epoch: [8][400/2759] Batch Time 0.510 (0.235) Data Time 0.356 (0.060) Loss 3.6382 (3.2496) Epoch: [8][600/2759] Batch Time 0.294 (0.232) Data Time 0.072 (0.057) Loss 2.9859 (3.2600) Epoch: [8][800/2759] Batch Time 0.186 (0.230) Data Time 0.000 (0.055) Loss 3.3603 (3.2437) Epoch: [8][1000/2759] Batch Time 0.169 (0.230) Data Time 0.000 (0.055) Loss 3.6212 (3.2495) Epoch: [8][1200/2759] Batch Time 0.149 (0.229) Data Time 0.000 (0.056) Loss 2.5735 (3.2391) Epoch: [8][1400/2759] Batch Time 0.194 (0.230) Data Time 
0.000 (0.056) Loss 2.2927 (3.2434) Epoch: [8][1600/2759] Batch Time 0.168 (0.230) Data Time 0.001 (0.057) Loss 3.0288 (3.2502) Epoch: [8][1800/2759] Batch Time 0.149 (0.230) Data Time 0.000 (0.057) Loss 3.3649 (3.2448) Epoch: [8][2000/2759] Batch Time 0.188 (0.230) Data Time 0.000 (0.057) Loss 3.3015 (3.2460) Epoch: [8][2200/2759] Batch Time 0.188 (0.230) Data Time 0.000 (0.057) Loss 2.4819 (3.2457) Epoch: [8][2400/2759] Batch Time 0.619 (0.230) Data Time 0.425 (0.057) Loss 2.8338 (3.2412) Epoch: [8][2600/2759] Batch Time 0.372 (0.229) Data Time 0.198 (0.056) Loss 3.0072 (3.2370) DECAYING learning rate. The new LR is 0.000010 Epoch: [9][0/2759] Batch Time 1.481 (1.481) Data Time 1.231 (1.231) Loss 3.5195 (3.5195) Epoch: [9][200/2759] Batch Time 0.192 (0.231) Data Time 0.000 (0.054) Loss 2.7245 (3.1407) Epoch: [9][400/2759] Batch Time 0.172 (0.232) Data Time 0.000 (0.056) Loss 2.3452 (3.1570) Epoch: [9][600/2759] Batch Time 0.217 (0.232) Data Time 0.000 (0.056) Loss 3.6985 (3.1703) Epoch: [9][800/2759] Batch Time 0.172 (0.231) Data Time 0.000 (0.057) Loss 4.1321 (3.1595) Epoch: [9][1000/2759] Batch Time 0.174 (0.231) Data Time 0.000 (0.056) Loss 3.0362 (3.1685) Epoch: [9][1200/2759] Batch Time 0.442 (0.230) Data Time 0.248 (0.056) Loss 3.1516 (3.1587) Epoch: [9][1400/2759] Batch Time 0.241 (0.229) Data Time 0.000 (0.055) Loss 3.3524 (3.1583) Epoch: [9][1600/2759] Batch Time 0.296 (0.230) Data Time 0.099 (0.055) Loss 3.5819 (3.1587) Epoch: [9][1800/2759] Batch Time 0.166 (0.230) Data Time 0.000 (0.055) Loss 3.2270 (3.1609) Epoch: [9][2000/2759] Batch Time 0.168 (0.229) Data Time 0.000 (0.054) Loss 3.7098 (3.1628) Epoch: [9][2200/2759] Batch Time 0.172 (0.229) Data Time 0.000 (0.054) Loss 2.3072 (3.1688) Epoch: [9][2400/2759] Batch Time 0.141 (0.229) Data Time 0.000 (0.055) Loss 3.5119 (3.1731) Epoch: [9][2600/2759] Batch Time 0.156 (0.229) Data Time 0.000 (0.054) Loss 3.1305 (3.1698) time elapsed: 6975.944407701492
This should be run after modifying the training loop to use a learning rate scheduler.
# Time one complete training run of train_SSD with the VGG base and the
# 'pytorch_scheduler' learning-rate option.
# NOTE(review): train_SSD is defined earlier in the notebook; presumably
# lr_type selects the PyTorch LR scheduler code path - confirm there.
start_time = time.time()
train_SSD(base_type='VGG', lr_type='pytorch_scheduler')
end_time = time.time()
# time.time() returns seconds, so this prints the wall-clock duration in seconds.
print("time elapsed:", end_time - start_time)
Number of Epochs to train: 10 Loaded base model.
/usr/local/lib/python3.7/dist-packages/torch/nn/_reduction.py:44: UserWarning: size_average and reduce args will be deprecated, please use reduction='none' instead. warnings.warn(warning.format(ret))
Epoch: [0][0/2759] Batch Time 126.295 (126.295) Data Time 123.017 (123.017) Loss 23.3069 (23.3069) Epoch: [0][200/2759] Batch Time 2.784 (1.322) Data Time 2.473 (1.062) Loss 6.4519 (10.7817) Epoch: [0][400/2759] Batch Time 2.270 (1.017) Data Time 1.942 (0.763) Loss 6.7676 (9.0850) Epoch: [0][600/2759] Batch Time 1.918 (0.915) Data Time 1.678 (0.661) Loss 5.9183 (8.2283) Epoch: [0][800/2759] Batch Time 1.003 (0.861) Data Time 0.758 (0.607) Loss 6.3973 (7.7605) Epoch: [0][1000/2759] Batch Time 0.247 (0.830) Data Time 0.000 (0.575) Loss 6.1305 (7.4524) Epoch: [0][1200/2759] Batch Time 0.994 (0.808) Data Time 0.746 (0.553) Loss 5.4685 (7.2168) Epoch: [0][1400/2759] Batch Time 0.252 (0.799) Data Time 0.000 (0.545) Loss 6.3343 (7.0401) Epoch: [0][1600/2759] Batch Time 0.249 (0.788) Data Time 0.000 (0.533) Loss 4.9598 (6.9061) Epoch: [0][1800/2759] Batch Time 0.252 (0.780) Data Time 0.000 (0.525) Loss 5.0622 (6.7854) Epoch: [0][2000/2759] Batch Time 2.098 (0.774) Data Time 1.841 (0.518) Loss 4.9490 (6.6860) Epoch: [0][2200/2759] Batch Time 0.264 (0.767) Data Time 0.000 (0.511) Loss 6.0652 (6.5916) Epoch: [0][2400/2759] Batch Time 2.319 (0.762) Data Time 1.984 (0.506) Loss 4.0999 (6.4969) Epoch: [0][2600/2759] Batch Time 0.773 (0.756) Data Time 0.529 (0.500) Loss 5.9469 (6.4153) Epoch: [1][0/2759] Batch Time 1.737 (1.737) Data Time 1.388 (1.388) Loss 5.9838 (5.9838) Epoch: [1][200/2759] Batch Time 0.275 (0.281) Data Time 0.000 (0.008) Loss 5.2495 (5.2815) Epoch: [1][400/2759] Batch Time 0.254 (0.276) Data Time 0.001 (0.004) Loss 5.3906 (5.2407) Epoch: [1][600/2759] Batch Time 0.274 (0.274) Data Time 0.000 (0.003) Loss 5.0140 (5.1926) Epoch: [1][800/2759] Batch Time 0.263 (0.273) Data Time 0.000 (0.003) Loss 5.5408 (5.1602) Epoch: [1][1000/2759] Batch Time 0.266 (0.273) Data Time 0.000 (0.002) Loss 5.1210 (5.1357) Epoch: [1][1200/2759] Batch Time 0.258 (0.273) Data Time 0.000 (0.002) Loss 4.7762 (5.0980) Epoch: [1][1400/2759] Batch Time 0.314 (0.272) Data Time 0.004 (0.002) 
Loss 4.3887 (5.0710) Epoch: [1][1600/2759] Batch Time 0.262 (0.272) Data Time 0.000 (0.002) Loss 5.2455 (5.0359) Epoch: [1][1800/2759] Batch Time 0.264 (0.272) Data Time 0.000 (0.002) Loss 4.7617 (5.0090) Epoch: [1][2000/2759] Batch Time 0.258 (0.272) Data Time 0.000 (0.002) Loss 4.5565 (4.9857) Epoch: [1][2200/2759] Batch Time 0.254 (0.272) Data Time 0.000 (0.001) Loss 5.3597 (4.9548) Epoch: [1][2400/2759] Batch Time 0.255 (0.272) Data Time 0.000 (0.001) Loss 5.6574 (4.9318) Epoch: [1][2600/2759] Batch Time 0.254 (0.271) Data Time 0.000 (0.001) Loss 3.8380 (4.9038) Epoch: [2][0/2759] Batch Time 1.443 (1.443) Data Time 1.079 (1.079) Loss 3.9161 (3.9161) Epoch: [2][200/2759] Batch Time 0.269 (0.278) Data Time 0.007 (0.006) Loss 5.3546 (4.4785) Epoch: [2][400/2759] Batch Time 0.261 (0.274) Data Time 0.000 (0.003) Loss 4.4224 (4.4599) Epoch: [2][600/2759] Batch Time 0.278 (0.274) Data Time 0.000 (0.002) Loss 4.4888 (4.4537) Epoch: [2][800/2759] Batch Time 0.278 (0.273) Data Time 0.000 (0.002) Loss 4.6680 (4.4264) Epoch: [2][1000/2759] Batch Time 0.254 (0.273) Data Time 0.000 (0.002) Loss 5.0808 (4.4195) Epoch: [2][1200/2759] Batch Time 0.270 (0.273) Data Time 0.000 (0.002) Loss 4.0437 (4.4146) Epoch: [2][1400/2759] Batch Time 0.273 (0.273) Data Time 0.000 (0.001) Loss 4.1603 (4.4009) Epoch: [2][1600/2759] Batch Time 0.284 (0.272) Data Time 0.000 (0.001) Loss 4.6247 (4.3817) Epoch: [2][1800/2759] Batch Time 0.274 (0.272) Data Time 0.000 (0.001) Loss 4.6660 (4.3731) Epoch: [2][2000/2759] Batch Time 0.271 (0.272) Data Time 0.000 (0.001) Loss 4.2470 (4.3549) Epoch: [2][2200/2759] Batch Time 0.258 (0.272) Data Time 0.000 (0.001) Loss 4.0770 (4.3387) Epoch: [2][2400/2759] Batch Time 0.264 (0.272) Data Time 0.000 (0.001) Loss 3.7620 (4.3239) Epoch: [2][2600/2759] Batch Time 0.280 (0.272) Data Time 0.000 (0.001) Loss 4.4740 (4.3093) Epoch: [3][0/2759] Batch Time 1.244 (1.244) Data Time 0.871 (0.871) Loss 4.3053 (4.3053) Epoch: [3][200/2759] Batch Time 0.303 (0.282) Data Time 
0.001 (0.006) Loss 4.6641 (4.0192) Epoch: [3][400/2759] Batch Time 0.294 (0.278) Data Time 0.000 (0.003) Loss 3.7765 (4.0386) Epoch: [3][600/2759] Batch Time 0.263 (0.277) Data Time 0.000 (0.003) Loss 4.4109 (4.0385) Epoch: [3][800/2759] Batch Time 0.288 (0.276) Data Time 0.000 (0.002) Loss 4.2864 (4.0266) Epoch: [3][1000/2759] Batch Time 0.264 (0.275) Data Time 0.000 (0.002) Loss 4.0941 (4.0127) Epoch: [3][1200/2759] Batch Time 0.274 (0.275) Data Time 0.000 (0.002) Loss 4.7895 (3.9992) Epoch: [3][1400/2759] Batch Time 0.259 (0.274) Data Time 0.000 (0.002) Loss 3.8122 (3.9936) Epoch: [3][1600/2759] Batch Time 0.263 (0.274) Data Time 0.000 (0.001) Loss 3.6413 (3.9965) Epoch: [3][1800/2759] Batch Time 0.278 (0.274) Data Time 0.000 (0.001) Loss 4.2709 (3.9961) Epoch: [3][2000/2759] Batch Time 0.281 (0.273) Data Time 0.000 (0.001) Loss 4.7888 (3.9900) Epoch: [3][2200/2759] Batch Time 0.271 (0.273) Data Time 0.000 (0.001) Loss 5.0446 (3.9827) Epoch: [3][2400/2759] Batch Time 0.279 (0.273) Data Time 0.000 (0.001) Loss 4.1812 (3.9738) Epoch: [3][2600/2759] Batch Time 0.271 (0.273) Data Time 0.000 (0.001) Loss 3.5292 (3.9700) Epoch: [4][0/2759] Batch Time 1.510 (1.510) Data Time 1.140 (1.140) Loss 4.1740 (4.1740) Epoch: [4][200/2759] Batch Time 0.269 (0.280) Data Time 0.000 (0.006) Loss 2.8392 (3.7427) Epoch: [4][400/2759] Batch Time 0.259 (0.277) Data Time 0.000 (0.004) Loss 3.4193 (3.8059) Epoch: [4][600/2759] Batch Time 0.283 (0.277) Data Time 0.000 (0.003) Loss 5.8802 (3.8055) Epoch: [4][800/2759] Batch Time 0.270 (0.276) Data Time 0.000 (0.002) Loss 3.1429 (3.7834) Epoch: [4][1000/2759] Batch Time 0.270 (0.275) Data Time 0.000 (0.002) Loss 3.3515 (3.7816) Epoch: [4][1200/2759] Batch Time 0.266 (0.275) Data Time 0.000 (0.002) Loss 3.5771 (3.7801) Epoch: [4][1400/2759] Batch Time 0.300 (0.275) Data Time 0.000 (0.002) Loss 3.3577 (3.7844) Epoch: [4][1600/2759] Batch Time 0.259 (0.275) Data Time 0.000 (0.002) Loss 3.9478 (3.7834) Epoch: [4][1800/2759] Batch Time 0.288 
(0.275) Data Time 0.002 (0.002) Loss 3.8504 (3.7806) Epoch: [4][2000/2759] Batch Time 0.287 (0.275) Data Time 0.000 (0.001) Loss 3.7902 (3.7825) Epoch: [4][2200/2759] Batch Time 0.262 (0.274) Data Time 0.000 (0.001) Loss 4.3665 (3.7793) Epoch: [4][2400/2759] Batch Time 0.262 (0.274) Data Time 0.000 (0.001) Loss 3.1017 (3.7750) Epoch: [4][2600/2759] Batch Time 0.279 (0.274) Data Time 0.005 (0.001) Loss 3.6945 (3.7656) Epoch: [5][0/2759] Batch Time 1.265 (1.265) Data Time 0.875 (0.875) Loss 3.4442 (3.4442) Epoch: [5][200/2759] Batch Time 0.267 (0.283) Data Time 0.000 (0.009) Loss 3.7996 (3.6611) Epoch: [5][400/2759] Batch Time 0.251 (0.278) Data Time 0.000 (0.005) Loss 3.2027 (3.6890) Epoch: [5][600/2759] Batch Time 0.279 (0.277) Data Time 0.005 (0.004) Loss 3.1178 (3.6798) Epoch: [5][800/2759] Batch Time 0.273 (0.276) Data Time 0.000 (0.003) Loss 3.8050 (3.6740) Epoch: [5][1000/2759] Batch Time 0.267 (0.275) Data Time 0.000 (0.002) Loss 3.5171 (3.6650) Epoch: [5][1200/2759] Batch Time 0.306 (0.275) Data Time 0.000 (0.002) Loss 3.5423 (3.6473) Epoch: [5][1400/2759] Batch Time 0.286 (0.275) Data Time 0.002 (0.002) Loss 3.2250 (3.6453) Epoch: [5][1600/2759] Batch Time 0.273 (0.275) Data Time 0.000 (0.002) Loss 3.0983 (3.6486) Epoch: [5][1800/2759] Batch Time 0.280 (0.275) Data Time 0.005 (0.002) Loss 2.9773 (3.6409) Epoch: [5][2000/2759] Batch Time 0.258 (0.274) Data Time 0.000 (0.002) Loss 3.5216 (3.6419) Epoch: [5][2200/2759] Batch Time 0.257 (0.274) Data Time 0.000 (0.002) Loss 3.8933 (3.6443) Epoch: [5][2400/2759] Batch Time 0.279 (0.274) Data Time 0.000 (0.001) Loss 3.6212 (3.6311) Epoch: [5][2600/2759] Batch Time 0.253 (0.274) Data Time 0.000 (0.001) Loss 4.6605 (3.6265) Epoch: [6][0/2759] Batch Time 1.396 (1.396) Data Time 1.037 (1.037) Loss 3.9322 (3.9322) Epoch: [6][200/2759] Batch Time 0.272 (0.280) Data Time 0.000 (0.006) Loss 2.9021 (3.5271) Epoch: [6][400/2759] Batch Time 0.255 (0.277) Data Time 0.005 (0.003) Loss 2.8927 (3.5392) Epoch: [6][600/2759] Batch 
Time 0.255 (0.276) Data Time 0.000 (0.003) Loss 3.7603 (3.5421) Epoch: [6][800/2759] Batch Time 0.271 (0.275) Data Time 0.000 (0.002) Loss 3.0422 (3.5290) Epoch: [6][1000/2759] Batch Time 0.286 (0.275) Data Time 0.000 (0.002) Loss 2.8022 (3.5413) Epoch: [6][1200/2759] Batch Time 0.274 (0.275) Data Time 0.000 (0.002) Loss 3.6551 (3.5411) Epoch: [6][1400/2759] Batch Time 0.275 (0.275) Data Time 0.000 (0.002) Loss 4.3799 (3.5266) Epoch: [6][1600/2759] Batch Time 0.303 (0.275) Data Time 0.000 (0.002) Loss 3.4515 (3.5212) Epoch: [6][1800/2759] Batch Time 0.264 (0.275) Data Time 0.000 (0.001) Loss 2.8264 (3.5230) Epoch: [6][2000/2759] Batch Time 0.259 (0.275) Data Time 0.000 (0.001) Loss 3.7481 (3.5182) Epoch: [6][2200/2759] Batch Time 0.311 (0.275) Data Time 0.000 (0.001) Loss 2.5754 (3.5195) Epoch: [6][2400/2759] Batch Time 0.265 (0.275) Data Time 0.000 (0.001) Loss 2.9873 (3.5187) Epoch: [6][2600/2759] Batch Time 0.279 (0.275) Data Time 0.000 (0.001) Loss 2.2086 (3.5113) Epoch: [7][0/2759] Batch Time 1.322 (1.322) Data Time 0.954 (0.954) Loss 4.0906 (4.0906) Epoch: [7][200/2759] Batch Time 0.275 (0.283) Data Time 0.000 (0.007) Loss 2.4941 (3.2642) Epoch: [7][400/2759] Batch Time 0.276 (0.279) Data Time 0.000 (0.004) Loss 2.4586 (3.2369) Epoch: [7][600/2759] Batch Time 0.276 (0.277) Data Time 0.000 (0.003) Loss 3.6148 (3.2076) Epoch: [7][800/2759] Batch Time 0.278 (0.276) Data Time 0.000 (0.002) Loss 2.5480 (3.1949) Epoch: [7][1000/2759] Batch Time 0.285 (0.276) Data Time 0.000 (0.002) Loss 2.6727 (3.1907) Epoch: [7][1200/2759] Batch Time 0.285 (0.275) Data Time 0.000 (0.002) Loss 2.5265 (3.1861) Epoch: [7][1400/2759] Batch Time 0.269 (0.275) Data Time 0.000 (0.002) Loss 3.2630 (3.1747) Epoch: [7][1600/2759] Batch Time 0.301 (0.275) Data Time 0.000 (0.002) Loss 2.9144 (3.1683) Epoch: [7][1800/2759] Batch Time 0.294 (0.275) Data Time 0.001 (0.002) Loss 3.7482 (3.1647) Epoch: [7][2000/2759] Batch Time 0.278 (0.275) Data Time 0.000 (0.002) Loss 3.7788 (3.1612) Epoch: 
[7][2200/2759] Batch Time 0.279 (0.275) Data Time 0.000 (0.001) Loss 2.7517 (3.1550) Epoch: [7][2400/2759] Batch Time 0.278 (0.275) Data Time 0.000 (0.001) Loss 2.8636 (3.1498) Epoch: [7][2600/2759] Batch Time 0.290 (0.274) Data Time 0.000 (0.001) Loss 2.3057 (3.1490) Epoch: [8][0/2759] Batch Time 1.344 (1.344) Data Time 0.974 (0.974) Loss 3.5138 (3.5138) Epoch: [8][200/2759] Batch Time 0.265 (0.283) Data Time 0.000 (0.007) Loss 2.6673 (3.0251) Epoch: [8][400/2759] Batch Time 0.273 (0.279) Data Time 0.000 (0.004) Loss 3.1610 (3.0332) Epoch: [8][600/2759] Batch Time 0.289 (0.277) Data Time 0.000 (0.003) Loss 3.0896 (3.0583) Epoch: [8][800/2759] Batch Time 0.265 (0.276) Data Time 0.000 (0.002) Loss 2.0746 (3.0628) Epoch: [8][1000/2759] Batch Time 0.286 (0.276) Data Time 0.000 (0.002) Loss 2.4474 (3.0618) Epoch: [8][1200/2759] Batch Time 0.263 (0.276) Data Time 0.000 (0.002) Loss 2.7454 (3.0699) Epoch: [8][1400/2759] Batch Time 0.262 (0.275) Data Time 0.001 (0.002) Loss 3.0548 (3.0737) Epoch: [8][1600/2759] Batch Time 0.272 (0.275) Data Time 0.000 (0.002) Loss 3.7653 (3.0780) Epoch: [8][1800/2759] Batch Time 0.271 (0.275) Data Time 0.000 (0.002) Loss 4.0378 (3.0758) Epoch: [8][2000/2759] Batch Time 0.264 (0.275) Data Time 0.000 (0.002) Loss 3.1043 (3.0807) Epoch: [8][2200/2759] Batch Time 0.258 (0.275) Data Time 0.000 (0.001) Loss 3.0007 (3.0823) Epoch: [8][2400/2759] Batch Time 0.297 (0.275) Data Time 0.000 (0.001) Loss 2.4156 (3.0830) Epoch: [8][2600/2759] Batch Time 0.265 (0.274) Data Time 0.001 (0.001) Loss 2.4196 (3.0780) Epoch: [9][0/2759] Batch Time 1.251 (1.251) Data Time 0.911 (0.911) Loss 3.5207 (3.5207) Epoch: [9][200/2759] Batch Time 0.274 (0.283) Data Time 0.000 (0.008) Loss 3.1686 (3.0869) Epoch: [9][400/2759] Batch Time 0.253 (0.278) Data Time 0.000 (0.004) Loss 3.1832 (3.0682) Epoch: [9][600/2759] Batch Time 0.257 (0.275) Data Time 0.000 (0.003) Loss 2.7437 (3.0851) Epoch: [9][800/2759] Batch Time 0.297 (0.275) Data Time 0.000 (0.003) Loss 3.2544 
(3.0686) Epoch: [9][1000/2759] Batch Time 0.297 (0.275) Data Time 0.010 (0.002) Loss 3.1543 (3.0609) Epoch: [9][1200/2759] Batch Time 0.270 (0.274) Data Time 0.000 (0.002) Loss 2.5052 (3.0633) Epoch: [9][1400/2759] Batch Time 0.281 (0.275) Data Time 0.000 (0.002) Loss 2.1103 (3.0525) Epoch: [9][1600/2759] Batch Time 0.266 (0.274) Data Time 0.000 (0.002) Loss 4.0975 (3.0533) Epoch: [9][1800/2759] Batch Time 0.269 (0.274) Data Time 0.000 (0.002) Loss 2.9278 (3.0539) Epoch: [9][2000/2759] Batch Time 0.296 (0.274) Data Time 0.000 (0.002) Loss 3.3743 (3.0557) Epoch: [9][2200/2759] Batch Time 0.274 (0.274) Data Time 0.000 (0.002) Loss 3.3863 (3.0539) Epoch: [9][2400/2759] Batch Time 0.290 (0.274) Data Time 0.000 (0.002) Loss 3.6861 (3.0480) Epoch: [9][2600/2759] Batch Time 0.285 (0.274) Data Time 0.000 (0.001) Loss 3.3688 (3.0444) time elapsed: 8888.30541729927
Now let's run the eval code. It should take about 30 minutes per model.
from utils import *
# from datasets import PascalVOCDataset
from tqdm import tqdm
from pprint import PrettyPrinter
# Good formatting when printing the APs for each class and mAP
pp = PrettyPrinter()

# Parameters
data_folder = './'  # folder where the data files are stored
keep_difficult = True  # difficult ground truth objects must always be considered in mAP calculation, because these objects DO exist!
batch_size = 64
workers = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = './checkpoint_ssd300_VGG.pth.tar'

# Load model checkpoint that is to be evaluated.
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was written on a GPU still loads when the runtime has fallen back to the CPU.
checkpoint = torch.load(checkpoint, map_location=device)
model = checkpoint['model']
model = model.to(device)

# Switch to eval mode
model.eval()

# Load test data (shuffle=False keeps the test images in a fixed order)
test_dataset = PascalVOCDataset(data_folder,
                                split='test',
                                keep_difficult=keep_difficult)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                                          collate_fn=test_dataset.collate_fn, num_workers=workers, pin_memory=True)
def evaluate(test_loader, model):
    """
    Run the model over every batch of the test loader, then print the
    per-class average precision and the mean average precision (mAP).

    Nothing is returned; the results are only printed.

    :param test_loader: DataLoader for test data
    :param model: model
    """
    # Inference only, so make sure the model is in eval mode
    model.eval()

    # Running collections of detections and ground truth over the whole test set
    all_det_boxes, all_det_labels, all_det_scores = [], [], []
    all_true_boxes, all_true_labels = [], []
    all_true_difficulties = []  # 'difficult' flags are needed by 'calculate_mAP' in utils.py

    with torch.no_grad():
        for images, boxes, labels, difficulties in tqdm(test_loader, desc='Evaluating'):
            # Forward prop. on the evaluation device
            locs, class_scores = model(images.to(device))

            # Evaluation MUST be at min_score=0.01, max_overlap=0.45, top_k=200
            # for fair comparison with the paper's results and other repos
            batch_boxes, batch_labels, batch_scores = model.detect_objects(
                locs, class_scores, min_score=0.01, max_overlap=0.45, top_k=200)

            # Keep this batch's detections for the mAP calculation
            all_det_boxes += batch_boxes
            all_det_labels += batch_labels
            all_det_scores += batch_scores

            # Ground truth goes onto the same device as the detections
            all_true_boxes += [b.to(device) for b in boxes]
            all_true_labels += [lab.to(device) for lab in labels]
            all_true_difficulties += [d.to(device) for d in difficulties]

        # Calculate mAP
        APs, mAP = calculate_mAP(all_det_boxes, all_det_labels, all_det_scores,
                                 all_true_boxes, all_true_labels, all_true_difficulties)

    # Print AP for each class, followed by the mean over classes
    pp.pprint(APs)
    print('\nMean Average Precision (mAP): %.3f' % mAP)
# Evaluate the VGG-based SSD300 checkpoint
checkpoint = './checkpoint_ssd300_VGG.pth.tar'

# Load model checkpoint that is to be evaluated.
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was written on a GPU still loads when the runtime has fallen back to the CPU.
checkpoint = torch.load(checkpoint, map_location=device)
model = checkpoint['model']
model = model.to(device)

# Switch to eval mode
model.eval()

# Load test data
test_dataset = PascalVOCDataset(data_folder,
                                split='test',
                                keep_difficult=keep_difficult)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                                          collate_fn=test_dataset.collate_fn, num_workers=workers, pin_memory=True)

# Prints the per-class APs and the mAP
evaluate(test_loader, model)
Evaluating: 0%| | 0/78 [00:00<?, ?it/s]/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:183: UserWarning: indexing with dtype torch.uint8 is now deprecated, please use a dtype torch.bool instead. (Triggered internally at /pytorch/aten/src/ATen/native/IndexingUtils.h:25.) /usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:185: UserWarning: indexing with dtype torch.uint8 is now deprecated, please use a dtype torch.bool instead. (Triggered internally at /pytorch/aten/src/ATen/native/IndexingUtils.h:25.) Evaluating: 100%|██████████| 78/78 [20:47<00:00, 16.00s/it]
{'aeroplane': 0.7069304585456848,
'bicycle': 0.7526691555976868,
'bird': 0.615410327911377,
'boat': 0.525149941444397,
'bottle': 0.26577845215797424,
'bus': 0.768818199634552,
'car': 0.7888773083686829,
'cat': 0.833739161491394,
'chair': 0.38870084285736084,
'cow': 0.7115549445152283,
'diningtable': 0.6042712926864624,
'dog': 0.806510865688324,
'horse': 0.7746341824531555,
'motorbike': 0.7582966685295105,
'person': 0.6959766149520874,
'pottedplant': 0.2745964825153351,
'sheep': 0.664925754070282,
'sofa': 0.6730805039405823,
'train': 0.7699201703071594,
'tvmonitor': 0.6671115159988403}
Mean Average Precision (mAP): 0.652
# Evaluate the ResNet-based SSD300 checkpoint
checkpoint = './checkpoint_ssd300_ResNet.pth.tar'

# Load model checkpoint that is to be evaluated.
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was written on a GPU still loads when the runtime has fallen back to the CPU.
checkpoint = torch.load(checkpoint, map_location=device)
model = checkpoint['model']
model = model.to(device)

# Switch to eval mode
model.eval()

# Load test data
test_dataset = PascalVOCDataset(data_folder,
                                split='test',
                                keep_difficult=keep_difficult)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                                          collate_fn=test_dataset.collate_fn, num_workers=workers, pin_memory=True)

# Prints the per-class APs and the mAP
evaluate(test_loader, model)
Evaluating: 0%| | 0/78 [00:00<?, ?it/s]/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:183: UserWarning: indexing with dtype torch.uint8 is now deprecated, please use a dtype torch.bool instead. (Triggered internally at /pytorch/aten/src/ATen/native/IndexingUtils.h:25.) /usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:185: UserWarning: indexing with dtype torch.uint8 is now deprecated, please use a dtype torch.bool instead. (Triggered internally at /pytorch/aten/src/ATen/native/IndexingUtils.h:25.) Evaluating: 100%|██████████| 78/78 [20:01<00:00, 15.40s/it]
{'aeroplane': 0.667491614818573,
'bicycle': 0.7133117318153381,
'bird': 0.6183592081069946,
'boat': 0.45969194173812866,
'bottle': 0.20122459530830383,
'bus': 0.7040727734565735,
'car': 0.7132814526557922,
'cat': 0.8372061848640442,
'chair': 0.3496056795120239,
'cow': 0.6219093799591064,
'diningtable': 0.5836526155471802,
'dog': 0.7992353439331055,
'horse': 0.7906227111816406,
'motorbike': 0.7182757258415222,
'person': 0.6065657734870911,
'pottedplant': 0.24183790385723114,
'sheep': 0.5955784320831299,
'sofa': 0.6600207090377808,
'train': 0.7756168246269226,
'tvmonitor': 0.6065837740898132}
Mean Average Precision (mAP): 0.613
# Evaluate the VGG-based SSD300 checkpoint saved as '..._VGG_scheduler'
checkpoint = './checkpoint_ssd300_VGG_scheduler.pth.tar'

# Load model checkpoint that is to be evaluated.
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was written on a GPU still loads when the runtime has fallen back to the CPU.
checkpoint = torch.load(checkpoint, map_location=device)
model = checkpoint['model']
model = model.to(device)

# Switch to eval mode
model.eval()

# Load test data
test_dataset = PascalVOCDataset(data_folder,
                                split='test',
                                keep_difficult=keep_difficult)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                                          collate_fn=test_dataset.collate_fn, num_workers=workers, pin_memory=True)

# Prints the per-class APs and the mAP
evaluate(test_loader, model)
Evaluating: 0%| | 0/78 [00:00<?, ?it/s]/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:183: UserWarning: indexing with dtype torch.uint8 is now deprecated, please use a dtype torch.bool instead. (Triggered internally at /pytorch/aten/src/ATen/native/IndexingUtils.h:25.) /usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:185: UserWarning: indexing with dtype torch.uint8 is now deprecated, please use a dtype torch.bool instead. (Triggered internally at /pytorch/aten/src/ATen/native/IndexingUtils.h:25.) Evaluating: 100%|██████████| 78/78 [21:47<00:00, 16.76s/it]
{'aeroplane': 0.7154119610786438,
'bicycle': 0.7397521734237671,
'bird': 0.6496826410293579,
'boat': 0.5451831221580505,
'bottle': 0.2889726161956787,
'bus': 0.7461641430854797,
'car': 0.7893444895744324,
'cat': 0.8354564309120178,
'chair': 0.3560791015625,
'cow': 0.7012356519699097,
'diningtable': 0.5439050793647766,
'dog': 0.7952905297279358,
'horse': 0.7889485955238342,
'motorbike': 0.7635747790336609,
'person': 0.6894237399101257,
'pottedplant': 0.303062379360199,
'sheep': 0.6825856566429138,
'sofa': 0.6492593288421631,
'train': 0.7860965132713318,
'tvmonitor': 0.6727997660636902}
Mean Average Precision (mAP): 0.652
And lastly let's view some images with our detections!
from torchvision import transforms
from utils import *
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model checkpoint.
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was written on a GPU still loads when the runtime has fallen back to the CPU.
checkpoint = 'checkpoint_ssd300_VGG.pth.tar'
checkpoint = torch.load(checkpoint, map_location=device)
start_epoch = checkpoint['epoch'] + 1  # stored epoch is zero-based in the log, so report the next one
print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
model = checkpoint['model']
model = model.to(device)
model.eval()

# Transforms applied to every image before it is fed to the model:
# resize to 300x300, convert to a tensor, then normalize each channel.
resize = transforms.Resize((300, 300))
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
def detect(original_image, min_score, max_overlap, top_k, suppress=None):
    """
    Detect objects in an image with a trained SSD300, and visualize the results.

    The input image is not modified: boxes and labels are drawn on a copy.

    :param original_image: image, a PIL Image
    :param min_score: minimum threshold for a detected box to be considered a match for a certain class
    :param max_overlap: maximum overlap two boxes can have so that the one with the lower score is not suppressed via Non-Maximum Suppression (NMS)
    :param top_k: if there are a lot of resulting detection across all classes, keep only the top 'k'
    :param suppress: classes that you know for sure cannot be in the image or you do not want in the image, a list
    :return: annotated copy of the image, a PIL Image; the original image itself when nothing was detected
    """
    # Transform
    image = normalize(to_tensor(resize(original_image)))

    # Move to default device
    image = image.to(device)

    # Inference only: no autograd graph is needed, same as in evaluate()
    with torch.no_grad():
        # Forward prop. on a batch of one image
        predicted_locs, predicted_scores = model(image.unsqueeze(0))

        # Detect objects in SSD output (the per-box scores are not used for drawing)
        det_boxes, det_labels, _ = model.detect_objects(predicted_locs, predicted_scores, min_score=min_score,
                                                        max_overlap=max_overlap, top_k=top_k)

    # Move detections to the CPU
    det_boxes = det_boxes[0].to('cpu')

    # Transform to original image dimensions
    original_dims = torch.FloatTensor(
        [original_image.width, original_image.height, original_image.width, original_image.height]).unsqueeze(0)
    det_boxes = det_boxes * original_dims

    # Decode class integer labels
    det_labels = [rev_label_map[label_id] for label_id in det_labels[0].to('cpu').tolist()]

    # If no objects found, the detected labels will be set to ['0.'], i.e. ['background'] in SSD300.detect_objects() in model.py
    if det_labels == ['background']:
        # Just return original image
        return original_image

    # Annotate a copy so the caller's image is left untouched
    annotated_image = original_image.copy()
    draw = ImageDraw.Draw(annotated_image)
    font = ImageFont.load_default()  # ImageFont.truetype("./calibril.ttf", 15)

    for box_location, label in zip(det_boxes.tolist(), det_labels):
        # Suppress specific classes, if needed
        if suppress is not None and label in suppress:
            continue

        color = label_color_map[label]

        # Boxes: a second rectangle at an offset of 1 pixel increases line thickness
        draw.rectangle(xy=box_location, outline=color)
        draw.rectangle(xy=[coord + 1. for coord in box_location], outline=color)

        # Text
        text = label.upper()
        if hasattr(font, 'getsize'):
            text_size = font.getsize(text)
        else:
            # Pillow 10 removed getsize(); getbbox() returns (left, top, right, bottom),
            # and (right, bottom) is the extent of the text drawn from its anchor point
            text_size = font.getbbox(text)[2:]
        text_location = [box_location[0] + 2., box_location[1] - text_size[1]]
        textbox_location = [box_location[0], box_location[1] - text_size[1], box_location[0] + text_size[0] + 4.,
                            box_location[1]]
        draw.rectangle(xy=textbox_location, fill=color)
        draw.text(xy=text_location, text=text, fill='white',
                  font=font)

    return annotated_image
# Hand-picked VOC2007 images to visualize detections on
relevant_images = [
    '000012.jpg',  # Car
    '000014.jpg',  # Car, Bus
    '000026.jpg',  # Car
    '000038.jpg',  # Cyclist
    '000054.jpg',  # Bus
    '000091.jpg',  # Vehicles parked, far from camera
    '000111.jpg',  # Cyclists in race, far from camera
    '000129.jpg'  # Cyclists in race, close to camera
]

for rel_img_file_name in relevant_images:
    img_path = '/content/gdrive/MyDrive/Colab Notebooks/ece495_assignment4/VOC2007/JPEGImages/' + rel_img_file_name

    # convert() returns a new, fully loaded image, so the file can be closed
    # right away instead of leaving one open handle per image
    with Image.open(img_path, mode='r') as image_file:
        original_image = image_file.convert('RGB')

    # Run detection and get back the annotated image
    img = detect(original_image, min_score=0.2, max_overlap=0.5, top_k=200)

    # One 10x10 inch figure per image
    fig = plt.figure(figsize=(10,10))
    ax1 = fig.add_subplot(1,1,1)
    ax1.imshow(img)
Loaded checkpoint from epoch 10.
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:183: UserWarning: indexing with dtype torch.uint8 is now deprecated, please use a dtype torch.bool instead. (Triggered internally at /pytorch/aten/src/ATen/native/IndexingUtils.h:25.) /usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:185: UserWarning: indexing with dtype torch.uint8 is now deprecated, please use a dtype torch.bool instead. (Triggered internally at /pytorch/aten/src/ATen/native/IndexingUtils.h:25.)